**SberBank Russian Housing Market***

In [1]:
#importing packages
import pandas as pd # data preprocessing
import numpy as np # linear algebra
import matplotlib.pyplot as plt #Basic plots visualisations
%matplotlib inline
import seaborn as sns #for prettier plots
import os
import pickle
import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
plt.figure(figsize=(8,6))

pd.set_option('display.max_columns' , 300)
pd.set_option('display.max_rows' , 300)
<Figure size 576x432 with 0 Axes>
In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor , AdaBoostRegressor , GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error,mean_squared_error 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor
In [3]:
import os
os.getcwd()
Out[3]:
'C:\\Users\\Hp\\Desktop\\Projects\\task\\russian housing market'
In [4]:
train_data=pd.read_csv("train_data.csv")
In [5]:
train_data.shape
Out[5]:
(30471, 292)
In [6]:
def summary(train_data):
    print('Shape of data :' ,train_data.shape )
    return(pd.DataFrame({"Datatype":train_data.dtypes ,
                          "NAs":train_data.isnull().sum() , 
                           "Uniques":train_data.nunique(),
                           "levels": [train_data[x].unique() for x in train_data.columns]}))
In [7]:
summary(train_data)
Shape of data : (30471, 292)
Out[7]:
Datatype NAs Uniques levels
id int64 0 30471 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
timestamp object 0 1161 [2011-08-20, 2011-08-23, 2011-08-27, 2011-09-0...
full_sq int64 0 211 [43, 34, 89, 77, 67, 25, 44, 42, 36, 38, 31, 5...
life_sq float64 6383 175 [27.0, 19.0, 29.0, 50.0, 77.0, 46.0, 14.0, 44....
floor float64 167 41 [4.0, 3.0, 2.0, 9.0, 14.0, 10.0, 5.0, 12.0, 11...
max_floor float64 9572 49 [nan, 17.0, 5.0, 22.0, 16.0, 9.0, 8.0, 0.0, 24...
material float64 9572 6 [nan, 1.0, 2.0, 4.0, 6.0, 5.0, 3.0]
build_year float64 13605 119 [nan, 1907.0, 1980.0, 2014.0, 1970.0, 1982.0, ...
num_room float64 9572 13 [nan, 2.0, 1.0, 3.0, 4.0, 5.0, 6.0, 0.0, 19.0,...
kitch_sq float64 9572 74 [nan, 11.0, 12.0, 0.0, 8.0, 1.0, 6.0, 10.0, 9....
state float64 13559 5 [nan, 3.0, 1.0, 2.0, 4.0, 33.0]
product_type object 0 2 [Investment, OwnerOccupier]
sub_area object 0 146 [Bibirevo, Nagatinskij Zaton, Tekstil'shhiki, ...
area_m float64 0 146 [6407578.1, 9589336.912, 4808269.831, 12583535...
raion_popul int64 0 146 [155572, 115352, 101708, 178473, 108171, 43795...
green_zone_part float64 0 146 [0.189727117, 0.37260204399999997, 0.112559644...
indust_part float64 0 132 [6.99893e-05, 0.049637257000000004, 0.11853738...
children_preschool int64 0 146 [9576, 6880, 5879, 13087, 5706, 2418, 2459, 65...
preschool_quota float64 6688 121 [5001.0, 3119.0, 1463.0, 6839.0, 3240.0, 852.0...
preschool_education_centers_raion int64 0 13 [5, 4, 9, 7, 2, 3, 13, 8, 6, 1, 10, 11, 0]
children_school int64 0 146 [10309, 7759, 6207, 13670, 6748, 2514, 2810, 6...
school_quota float64 6685 125 [11065.0, 6237.0, 5580.0, 17063.0, 7770.0, 201...
school_education_centers_raion int64 0 14 [5, 8, 7, 10, 9, 3, 6, 4, 14, 1, 13, 11, 2, 0]
school_education_centers_top_20_raion int64 0 3 [0, 1, 2]
hospital_beds_raion float64 14441 79 [240.0, 229.0, 1183.0, nan, 562.0, 4849.0, 189...
healthcare_centers_raion int64 0 7 [1, 4, 0, 3, 2, 5, 6]
university_top_20_raion int64 0 4 [0, 2, 1, 3]
sport_objects_raion int64 0 24 [7, 6, 5, 17, 25, 4, 3, 29, 12, 16, 2, 0, 10, ...
additional_education_raion int64 0 12 [3, 1, 6, 2, 0, 16, 8, 4, 5, 11, 10, 7]
culture_objects_top_25 object 0 2 [no, yes]
culture_objects_top_25_raion int64 0 6 [0, 1, 3, 2, 4, 10]
shopping_centers_raion int64 0 16 [16, 3, 0, 11, 10, 6, 5, 7, 15, 2, 1, 9, 4, 8,...
office_raion int64 0 30 [1, 0, 4, 93, 19, 9, 7, 3, 84, 14, 2, 6, 5, 48...
thermal_power_plant_raion object 0 2 [no, yes]
incineration_raion object 0 2 [no, yes]
oil_chemistry_raion object 0 2 [no, yes]
radiation_raion object 0 2 [no, yes]
railroad_terminal_raion object 0 2 [no, yes]
big_market_raion object 0 2 [no, yes]
nuclear_reactor_raion object 0 2 [no, yes]
detention_facility_raion object 0 2 [no, yes]
full_all int64 0 146 [86206, 76284, 101982, 21155, 28179, 19940, 85...
male_f int64 0 146 [40477, 34200, 46076, 9828, 13522, 9400, 40724...
female_f int64 0 146 [45729, 42084, 55906, 11327, 14657, 10540, 452...
young_all int64 0 146 [21154, 15727, 13028, 28563, 13368, 5291, 5682...
young_male int64 0 145 [11007, 7925, 6835, 14680, 7159, 2744, 2925, 7...
young_female int64 0 145 [10147, 7802, 6193, 13883, 6209, 2547, 2757, 6...
work_all int64 0 145 [98207, 70194, 63388, 120381, 68043, 29660, 35...
work_male int64 0 145 [52277, 35622, 31813, 60040, 34236, 15793, 174...
work_female int64 0 146 [45930, 34572, 31575, 60341, 33807, 13867, 175...
ekder_all int64 0 146 [36211, 29431, 25292, 29529, 26760, 8844, 1672...
ekder_male int64 0 146 [10580, 9266, 7609, 9083, 8563, 2608, 5351, 69...
ekder_female int64 0 146 [25631, 20165, 17683, 20446, 18197, 6236, 1136...
0_6_all int64 0 146 [9576, 6880, 5879, 13087, 5706, 2418, 2459, 65...
0_6_male int64 0 144 [4899, 3466, 3095, 6645, 2982, 1224, 1241, 345...
0_6_female int64 0 145 [4677, 3414, 2784, 6442, 2724, 1194, 1218, 305...
7_14_all int64 0 146 [10309, 7759, 6207, 13670, 6748, 2514, 2810, 6...
7_14_male int64 0 142 [5463, 3909, 3269, 7126, 3664, 1328, 1472, 345...
7_14_female int64 0 145 [4846, 3850, 2938, 6544, 3084, 1186, 1338, 311...
0_17_all int64 0 145 [23603, 17700, 14884, 32063, 15237, 5866, 6510...
0_17_male int64 0 146 [12286, 8998, 7821, 16513, 8113, 3035, 3345, 8...
0_17_female int64 0 146 [11317, 8702, 7063, 15550, 7124, 2831, 3165, 7...
16_29_all int64 0 145 [17508, 15164, 19401, 3292, 5164, 4851, 19445,...
16_29_male int64 0 145 [9425, 7571, 9045, 1450, 2583, 2329, 10085, 84...
16_29_female int64 0 146 [8083, 7593, 10356, 1842, 2581, 2522, 9360, 91...
0_13_all int64 0 146 [18654, 13729, 11252, 24934, 11631, 4632, 4884...
0_13_male int64 0 144 [9709, 6929, 5916, 12782, 6223, 2399, 2507, 64...
0_13_female int64 0 146 [8945, 6800, 5336, 12152, 5408, 2233, 2377, 57...
raion_build_count_with_material_info float64 4991 112 [211.0, 245.0, 330.0, 458.0, 746.0, 188.0, 217...
build_count_block float64 4991 76 [25.0, 83.0, 59.0, 9.0, 48.0, 24.0, 23.0, 101....
build_count_wood float64 4991 34 [0.0, 1.0, 51.0, 2.0, 204.0, 793.0, 11.0, 6.0,...
build_count_frame float64 4991 21 [0.0, 12.0, 14.0, 36.0, 1.0, 97.0, 4.0, 83.0, ...
build_count_brick float64 4991 101 [0.0, 67.0, 206.0, 124.0, 643.0, 147.0, 139.0,...
build_count_monolith float64 4991 33 [2.0, 4.0, 50.0, 16.0, 12.0, 11.0, 21.0, 14.0,...
build_count_panel float64 4991 91 [184.0, 90.0, 60.0, 201.0, 35.0, 15.0, 41.0, 1...
build_count_foam float64 4991 4 [0.0, 2.0, 1.0, 11.0, nan]
build_count_slag float64 4991 21 [0.0, 1.0, 9.0, 3.0, 10.0, 64.0, 2.0, 41.0, 12...
build_count_mix float64 4991 9 [0.0, 2.0, 1.0, 9.0, 6.0, 8.0, 5.0, 3.0, 4.0, ...
raion_build_count_with_builddate_info float64 4991 114 [211.0, 244.0, 330.0, 459.0, 746.0, 188.0, 216...
build_count_before_1920 float64 4991 29 [0.0, 1.0, 13.0, 371.0, 11.0, 47.0, 298.0, 240...
build_count_1921-1945 float64 4991 47 [0.0, 1.0, 24.0, 114.0, 5.0, 38.0, 9.0, 88.0, ...
build_count_1946-1970 float64 4991 100 [0.0, 143.0, 246.0, 40.0, 146.0, 152.0, 90.0, ...
build_count_1971-1995 float64 4991 92 [206.0, 84.0, 63.0, 130.0, 62.0, 25.0, 58.0, 3...
build_count_after_1995 float64 4991 69 [5.0, 15.0, 20.0, 252.0, 53.0, 6.0, 19.0, 51.0...
ID_metro int64 0 223 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
metro_min_avto float64 0 11843 [2.590241095, 0.936699728, 2.120998901, 1.4890...
metro_km_avto float64 0 11843 [1.131259906, 0.647336757, 1.637996285, 0.9845...
metro_min_walk float64 25 11834 [13.57511887, 7.620630407999999, 17.3515153999...
metro_km_walk float64 25 11834 [1.131259906, 0.635052534, 1.445959617, 0.9638...
kindergarten_km float64 0 11852 [0.14569955199999998, 0.147754269, 0.049101535...
school_km float64 0 11825 [0.17797535, 0.273345319, 0.158071895, 0.23645...
park_km float64 0 11852 [2.158587074, 0.550689737, 0.374847751, 0.0780...
green_zone_km float64 0 11735 [0.600973099, 0.065321162, 0.453172405, 0.1061...
industrial_km float64 0 11723 [1.080934313, 0.966479097, 0.939275144, 0.4511...
water_treatment_km float64 0 11828 [23.68346, 1.317476, 4.91266, 15.62371, 10.683...
cemetery_km float64 0 11828 [1.804127, 4.655004, 3.3810830000000003, 2.017...
incineration_km float64 0 11829 [3.6333339999999996, 8.648587, 11.99648, 14.31...
railroad_station_walk_km float64 25 11834 [5.419893032, 3.4119930839999997, 1.277658039,...
railroad_station_walk_min float64 25 11834 [65.03871639, 40.943917, 15.33189647, 51.49719...
ID_railroad_station_walk float64 25 133 [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...
railroad_station_avto_km float64 0 11843 [5.419893032, 3.641772591, 1.277658039, 3.8160...
railroad_station_avto_min float64 0 11843 [6.905892968, 4.679744508, 1.7014195369999998,...
ID_railroad_station_avto int64 0 133 [1, 2, 3, 4, 113, 6, 7, 9, 22, 11, 128, 13, 14...
public_transport_station_km float64 0 11851 [0.274985143, 0.065263344, 0.32875604399999997...
public_transport_station_min_walk float64 0 11852 [3.2998217139999997, 0.78316013, 3.945072522, ...
water_km float64 0 11851 [0.992631058, 0.698081318, 0.468264622, 1.2003...
water_1line object 0 2 [no, yes]
mkad_km float64 0 11852 [1.42239141, 9.503405157000001, 5.60479992, 2....
ttk_km float64 0 11852 [10.9185867, 3.1039959539999997, 2.92748709699...
sadovoe_km float64 0 11852 [13.10061764, 6.444333466000001, 6.963402995, ...
bulvar_ring_km float64 0 11852 [13.67565705, 8.132640073, 8.054252314, 18.309...
kremlin_km float64 0 11852 [15.15621058, 8.698054189, 9.067884956, 19.487...
big_road1_km float64 0 11852 [1.422391404, 2.887376585, 0.647249803, 2.6778...
ID_big_road1 int64 0 48 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
big_road1_1line object 0 2 [no, yes]
big_road2_km float64 0 11852 [3.830951404, 3.1039959739999996, 2.927487099,...
ID_big_road2 int64 0 58 [5, 4, 17, 10, 3, 20, 36, 14, 9, 11, 1, 33, 32...
railroad_km float64 0 11852 [1.305159492, 0.694535727, 0.70069112, 1.99926...
railroad_1line object 0 2 [no, yes]
zd_vokzaly_avto_km float64 0 11843 [14.23196091, 9.242585522, 9.540544478, 17.478...
ID_railroad_terminal int64 0 8 [101, 32, 5, 83, 113, 97, 121, 50]
bus_terminal_avto_km float64 0 11843 [24.2924061, 5.706113234, 6.710302485, 6.73461...
ID_bus_terminal int64 0 14 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
oil_chemistry_km float64 0 11852 [18.152338, 9.034641872, 5.777393501000001, 27...
nuclear_reactor_km float64 0 11852 [5.718518835, 3.4899544430000002, 7.50661249, ...
radiation_km float64 0 11852 [1.210027392, 2.72429538, 0.7722161040000001, ...
power_transmission_line_km float64 0 11852 [1.0625130459999999, 1.2461487390000001, 1.602...
thermal_power_plant_km float64 0 11852 [5.814134663, 3.419574049, 3.682454651, 11.178...
ts_km float64 0 11849 [4.308127002, 0.7255604309999999, 3.562187704,...
big_market_km float64 0 11843 [10.81417151, 6.910567711000001, 5.75236835, 2...
market_shop_km float64 0 11843 [1.6762583130000002, 3.4247160919999997, 1.375...
fitness_km float64 0 11775 [0.485841388, 0.6683636789999999, 0.733101062,...
swim_pool_km float64 0 11843 [3.0650470989999996, 2.000153804, 1.239303854,...
ice_rink_km float64 0 11843 [1.1075942090000002, 8.97282283, 1.97851718699...
stadium_km float64 0 11843 [8.148590774, 6.127072782000001, 0.767568769, ...
basketball_km float64 0 11852 [3.5165129110000004, 1.161578983, 1.952770629,...
hospice_morgue_km float64 0 11852 [2.392353035, 2.543746975, 0.621357002, 3.5495...
detention_facility_km float64 0 11843 [4.2480358869999995, 12.64987875, 7.682302975,...
public_healthcare_km float64 0 11843 [0.974742843, 1.47772267, 0.097143527, 2.16373...
university_km float64 0 11843 [6.715025787, 1.852560245, 0.8412541020000001,...
workplaces_km float64 0 11844 [0.8843500209999999, 0.686251693, 1.5100888540...
shopping_centers_km float64 0 11813 [0.648487637, 0.519311324, 1.48653302, 0.59991...
office_km float64 0 11806 [0.637188832, 0.688796317, 1.5430488359999999,...
additional_education_km float64 0 11843 [0.947961657, 1.072315063, 0.391957389, 0.8926...
preschool_km float64 0 11828 [0.17797535, 0.273345319, 0.158071895, 0.23645...
big_church_km float64 0 11852 [0.625783434, 0.967820571, 3.178751487, 1.0317...
church_synagogue_km float64 0 11852 [0.628186549, 0.471446524, 0.755946015, 1.5615...
mosque_km float64 0 11852 [3.932040333, 4.841543888, 7.92215157, 15.3004...
theater_km float64 0 11843 [14.05304655, 6.829888847, 4.273200485, 16.990...
museum_km float64 0 11852 [7.389497904, 0.709260033, 3.156422843, 16.041...
exhibition_km float64 0 11852 [7.023704919, 2.358840498, 4.958214283, 5.0296...
catering_km float64 0 11852 [0.516838085, 0.23028691, 0.190461977, 0.46582...
ecology object 0 5 [good, excellent, poor, satisfactory, no data]
green_part_500 float64 0 3313 [0.0, 25.14, 1.67, 17.36, 3.56, 17.62, 7.71, 3...
prom_part_500 float64 0 2571 [0.0, 0.57, 4.44, 19.42, 15.12, 39.33, 13.12, ...
office_count_500 int64 0 30 [0, 15, 5, 3, 1, 2, 4, 8, 6, 10, 7, 25, 9, 12,...
office_sqm_500 int64 0 1070 [0, 293699, 227705, 7719, 15565, 34565, 122400...
trc_count_500 int64 0 9 [0, 1, 3, 2, 5, 4, 6, 7, 8]
trc_sqm_500 int64 0 500 [0, 45000, 102000, 8499, 3164, 420403, 7208, 3...
cafe_count_500 int64 0 95 [0, 5, 3, 2, 48, 7, 4, 1, 13, 6, 16, 20, 23, 5...
cafe_sum_500_min_price_avg float64 13281 655 [nan, 860.0, 666.67, 1000.0, 702.22, 750.0, 63...
cafe_sum_500_max_price_avg float64 13281 446 [nan, 1500.0, 1166.67, 1625.0, 1250.0, 1083.33...
cafe_avg_price_500 float64 13281 860 [nan, 1180.0, 916.67, 1250.0, 934.44, 1312.5, ...
cafe_count_500_na_price int64 0 13 [0, 3, 1, 2, 4, 7, 9, 5, 6, 10, 13, 8, 11]
cafe_count_500_price_500 int64 0 33 [0, 1, 17, 2, 3, 4, 10, 5, 8, 19, 7, 16, 9, 11...
cafe_count_500_price_1000 int64 0 32 [0, 3, 2, 10, 1, 7, 5, 6, 9, 13, 4, 8, 11, 14,...
cafe_count_500_price_1500 int64 0 29 [0, 1, 2, 11, 3, 8, 4, 16, 6, 13, 5, 10, 9, 7,...
cafe_count_500_price_2500 int64 0 21 [0, 7, 1, 2, 3, 6, 4, 14, 5, 13, 9, 11, 19, 8,...
cafe_count_500_price_4000 int64 0 14 [0, 1, 2, 4, 3, 5, 11, 8, 7, 6, 9, 10, 13, 14]
cafe_count_500_price_high int64 0 4 [0, 1, 2, 3]
big_church_count_500 int64 0 11 [0, 1, 2, 8, 3, 4, 5, 6, 10, 11, 7]
church_count_500 int64 0 15 [0, 1, 4, 2, 3, 15, 6, 5, 8, 9, 7, 10, 12, 17,...
mosque_count_500 int64 0 2 [0, 1]
leisure_count_500 int64 0 10 [0, 2, 1, 4, 3, 6, 5, 7, 9, 8]
sport_count_500 int64 0 12 [1, 0, 3, 2, 5, 4, 6, 8, 9, 7, 10, 11]
market_count_500 int64 0 5 [0, 1, 2, 3, 4]
green_part_1000 float64 0 3735 [7.36, 26.66, 4.99, 19.25, 3.34, 0.0, 14.59, 2...
prom_part_1000 float64 0 3175 [0.0, 0.07, 0.29, 10.35, 8.29, 40.27, 4.16, 6....
office_count_1000 int64 0 84 [1, 2, 0, 46, 10, 8, 19, 6, 3, 16, 4, 17, 11, ...
office_sqm_1000 int64 0 1942 [30500, 86600, 0, 11000, 420952, 275135, 15191...
trc_count_1000 int64 0 19 [3, 5, 0, 6, 1, 2, 7, 4, 8, 12, 9, 20, 13, 10,...
trc_sqm_1000 int64 0 1063 [55600, 94065, 0, 80780, 158200, 164000, 19400...
cafe_count_1000 int64 0 235 [19, 13, 9, 12, 153, 16, 10, 5, 2, 1, 4, 55, 3...
cafe_sum_1000_min_price_avg float64 6524 1627 [527.78, 615.38, 642.86, 658.33, 763.45, 883.3...
cafe_sum_1000_max_price_avg float64 6524 1054 [888.89, 1076.92, 1142.86, 1083.33, 1272.41, 1...
cafe_avg_price_1000 float64 6524 2134 [708.33, 846.15, 892.86, 870.83, 1017.93, 1150...
cafe_count_1000_na_price int64 0 28 [1, 0, 2, 8, 3, 7, 5, 4, 12, 20, 13, 14, 6, 15...
cafe_count_1000_price_500 int64 0 82 [10, 5, 0, 3, 39, 1, 4, 2, 15, 9, 7, 6, 16, 11...
cafe_count_1000_price_1000 int64 0 83 [4, 6, 5, 45, 1, 7, 2, 0, 15, 3, 8, 11, 10, 14...
cafe_count_1000_price_1500 int64 0 84 [3, 1, 2, 5, 39, 0, 11, 7, 4, 6, 23, 14, 29, 1...
cafe_count_1000_price_2500 int64 0 57 [1, 0, 19, 2, 11, 4, 5, 6, 3, 10, 17, 7, 15, 5...
cafe_count_1000_price_4000 int64 0 29 [0, 1, 2, 5, 13, 3, 6, 27, 4, 12, 10, 7, 8, 9,...
cafe_count_1000_price_high int64 0 8 [0, 1, 2, 5, 3, 6, 4, 7]
big_church_count_1000 int64 0 24 [1, 0, 7, 3, 2, 4, 6, 5, 16, 9, 8, 15, 13, 10,...
church_count_1000 int64 0 36 [2, 1, 0, 12, 4, 3, 5, 9, 6, 10, 13, 35, 11, 1...
mosque_count_1000 int64 0 2 [0, 1]
leisure_count_1000 int64 0 26 [0, 4, 6, 2, 1, 11, 5, 8, 7, 3, 9, 30, 21, 18,...
sport_count_1000 int64 0 23 [6, 2, 5, 3, 7, 1, 0, 8, 4, 10, 14, 9, 11, 13,...
market_count_1000 int64 0 7 [1, 0, 3, 2, 4, 5, 6]
green_part_1500 float64 0 3934 [14.27, 21.53, 9.92, 28.38, 4.12, 0.0, 20.5, 1...
prom_part_1500 float64 0 3332 [6.92, 7.71, 6.73, 6.57, 4.83, 50.64, 5.57, 1....
office_count_1500 int64 0 154 [3, 0, 2, 93, 18, 20, 5, 1, 38, 12, 4, 44, 6, ...
office_sqm_1500 int64 0 2639 [39554, 102910, 0, 11000, 1195735, 431090, 453...
trc_count_1500 int64 0 27 [9, 7, 1, 6, 11, 0, 5, 4, 2, 12, 13, 3, 8, 10,...
trc_sqm_1500 int64 0 1785 [171420, 127065, 2600, 89492, 445900, 186400, ...
cafe_count_1500 int64 0 374 [34, 17, 14, 23, 272, 44, 29, 15, 9, 2, 4, 13,...
cafe_sum_1500_min_price_avg float64 4199 2582 [566.67, 694.12, 516.67, 673.91, 766.8, 718.18...
cafe_sum_1500_max_price_avg float64 4199 1723 [969.7, 1205.88, 916.67, 1130.43, 1272.73, 118...
cafe_avg_price_1500 float64 4199 3330 [768.18, 950.0, 716.67, 902.17, 1019.76, 915.8...
cafe_count_1500_na_price int64 0 48 [1, 0, 2, 19, 3, 13, 4, 5, 16, 23, 6, 11, 9, 7...
cafe_count_1500_price_500 int64 0 147 [14, 6, 4, 5, 70, 3, 12, 13, 2, 0, 1, 19, 10, ...
cafe_count_1500_price_1000 int64 0 145 [11, 7, 6, 9, 74, 3, 17, 4, 1, 0, 25, 8, 5, 2,...
cafe_count_1500_price_1500 int64 0 135 [6, 1, 2, 8, 72, 4, 7, 0, 5, 32, 3, 9, 18, 12,...
cafe_count_1500_price_2500 int64 0 97 [2, 0, 1, 30, 4, 3, 29, 6, 5, 21, 8, 10, 7, 24...
cafe_count_1500_price_4000 int64 0 53 [0, 1, 6, 9, 3, 4, 32, 2, 7, 10, 45, 31, 35, 5...
cafe_count_1500_price_high int64 0 13 [0, 1, 5, 2, 4, 3, 12, 6, 9, 11, 7, 8, 10]
big_church_count_1500 int64 0 42 [1, 0, 18, 4, 2, 7, 8, 3, 5, 13, 6, 11, 9, 15,...
church_count_1500 int64 0 62 [2, 5, 4, 0, 30, 11, 3, 1, 6, 9, 8, 14, 7, 10,...
mosque_count_1500 int64 0 2 [0, 1]
leisure_count_1500 int64 0 41 [0, 4, 10, 3, 1, 2, 5, 9, 12, 8, 20, 7, 13, 18...
sport_count_1500 int64 0 34 [7, 9, 6, 14, 11, 20, 10, 1, 3, 12, 8, 4, 0, 5...
market_count_1500 int64 0 8 [1, 0, 5, 2, 3, 4, 6, 7]
green_part_2000 float64 0 3981 [11.77, 22.37, 12.99, 32.29, 4.53, 0.38, 23.45...
prom_part_2000 float64 0 3348 [15.97, 19.25, 12.75, 5.73, 5.02, 51.58, 5.25,...
office_count_2000 int64 0 226 [9, 4, 2, 149, 21, 42, 6, 12, 0, 1, 5, 58, 25,...
office_sqm_2000 int64 0 3288 [188854, 165510, 100200, 11000, 1625130, 47129...
trc_count_2000 int64 0 37 [19, 8, 7, 17, 14, 12, 0, 4, 6, 15, 9, 16, 5, ...
trc_sqm_2000 int64 0 2397 [1244891, 179065, 52550, 89492, 564843, 683945...
cafe_count_2000 int64 0 529 [36, 21, 24, 25, 483, 33, 71, 18, 22, 2, 11, 2...
cafe_sum_2000_min_price_avg float64 1725 3537 [614.29, 695.24, 563.64, 660.0, 765.93, 741.38...
cafe_sum_2000_max_price_avg float64 1725 2429 [1042.86, 1190.48, 977.27, 1120.0, 1269.23, 12...
cafe_avg_price_2000 float64 1725 4505 [828.57, 942.86, 770.45, 890.0, 1017.58, 1000....
cafe_count_2000_na_price int64 0 68 [1, 0, 2, 28, 4, 6, 14, 3, 7, 8, 11, 19, 9, 5,...
cafe_count_2000_price_500 int64 0 212 [15, 7, 8, 5, 130, 16, 2, 0, 1, 4, 36, 14, 19,...
cafe_count_2000_price_1000 int64 0 210 [11, 8, 9, 129, 13, 24, 7, 10, 1, 4, 41, 19, 1...
cafe_count_2000_price_1500 int64 0 215 [6, 3, 4, 8, 131, 17, 1, 50, 14, 9, 0, 7, 2, 1...
cafe_count_2000_price_2500 int64 0 145 [2, 1, 50, 7, 4, 0, 39, 3, 5, 18, 8, 22, 11, 1...
cafe_count_2000_price_4000 int64 0 73 [1, 0, 14, 18, 2, 6, 5, 3, 4, 12, 40, 9, 16, 2...
cafe_count_2000_price_high int64 0 17 [0, 1, 6, 2, 3, 4, 5, 10, 7, 8, 15, 11, 12, 16...
big_church_count_2000 int64 0 63 [1, 0, 35, 6, 9, 2, 8, 3, 5, 4, 23, 17, 11, 10...
church_count_2000 int64 0 95 [2, 5, 4, 1, 61, 14, 3, 6, 7, 8, 0, 10, 11, 32...
mosque_count_2000 int64 0 2 [0, 1]
leisure_count_2000 int64 0 52 [0, 4, 17, 1, 3, 6, 2, 5, 9, 14, 8, 11, 7, 47,...
sport_count_2000 int64 0 53 [10, 11, 8, 13, 21, 28, 9, 14, 1, 7, 23, 17, 1...
market_count_2000 int64 0 9 [1, 0, 5, 2, 3, 4, 7, 6, 8]
green_part_3000 float64 0 3924 [11.98, 18.07, 12.14, 20.79, 5.06, 1.82, 22.2,...
prom_part_3000 float64 0 3061 [13.55, 27.32, 26.46, 3.57, 8.62, 39.99, 7.57,...
office_count_3000 int64 0 395 [12, 8, 4, 305, 54, 72, 33, 10, 0, 2, 120, 61,...
office_sqm_3000 int64 0 4281 [251554, 821986, 110856, 167000, 3420907, 1181...
trc_count_3000 int64 0 65 [23, 14, 7, 12, 60, 29, 24, 21, 2, 11, 15, 22,...
trc_sqm_3000 int64 0 3419 [1419204, 491565, 52550, 205756, 2296870, 1059...
cafe_count_3000 int64 0 877 [68, 30, 41, 32, 1068, 120, 160, 55, 98, 56, 6...
cafe_sum_3000_min_price_avg float64 991 5163 [639.68, 631.03, 697.44, 718.75, 853.03, 737.9...
cafe_sum_3000_max_price_avg float64 991 3885 [1079.37, 1086.21, 1192.31, 1218.75, 1410.45, ...
cafe_avg_price_3000 float64 991 6087 [859.52, 858.62, 944.87, 968.75, 1131.74, 984....
cafe_count_3000_na_price int64 0 112 [5, 1, 2, 0, 63, 12, 7, 4, 11, 3, 6, 31, 10, 2...
cafe_count_3000_price_500 int64 0 360 [21, 11, 9, 5, 266, 24, 41, 36, 0, 7, 91, 29, ...
cafe_count_3000_price_1000 int64 0 358 [22, 11, 17, 14, 267, 37, 57, 15, 28, 2, 12, 9...
cafe_count_3000_price_1500 int64 0 343 [16, 4, 9, 10, 262, 35, 37, 11, 21, 12, 109, 3...
cafe_count_3000_price_2500 int64 0 244 [3, 2, 149, 11, 16, 1, 9, 0, 81, 10, 4, 8, 6, ...
cafe_count_3000_price_4000 int64 0 112 [1, 0, 57, 2, 8, 44, 3, 9, 13, 5, 4, 16, 41, 3...
cafe_count_3000_price_high int64 0 24 [0, 4, 11, 1, 2, 9, 14, 6, 10, 8, 3, 15, 16, 1...
big_church_count_3000 int64 0 101 [2, 1, 0, 70, 12, 17, 3, 4, 5, 15, 18, 6, 7, 3...
church_count_3000 int64 0 160 [4, 7, 11, 2, 121, 12, 29, 8, 9, 3, 5, 19, 22,...
mosque_count_3000 int64 0 3 [0, 1, 2]
leisure_count_3000 int64 0 80 [0, 6, 40, 2, 8, 1, 10, 4, 3, 5, 15, 16, 11, 3...
sport_count_3000 int64 0 101 [21, 19, 20, 18, 77, 31, 56, 24, 32, 4, 17, 23...
market_count_3000 int64 0 11 [1, 6, 3, 5, 7, 2, 0, 4, 8, 10, 9]
green_part_5000 float64 0 3514 [13.09, 10.26, 13.69, 14.18, 8.38, 5.92, 25.23...
prom_part_5000 float64 178 2399 [13.31, 27.47, 21.58, 3.89, 10.92, 25.79, 12.7...
office_count_5000 int64 0 725 [29, 66, 43, 8, 689, 253, 228, 24, 94, 30, 2, ...
office_sqm_5000 int64 0 5970 [807385, 2690465, 1478160, 244166, 8404624, 42...
trc_count_5000 int64 0 121 [52, 40, 35, 22, 114, 63, 49, 45, 41, 32, 20, ...
trc_sqm_5000 int64 0 5204 [4036616, 2034942, 1572990, 942180, 3503058, 2...
cafe_count_5000 int64 0 1580 [152, 177, 122, 61, 2283, 567, 635, 143, 292, ...
cafe_sum_5000_min_price_avg float64 297 7249 [708.57, 673.81, 702.68, 931.58, 853.88, 769.9...
cafe_sum_5000_max_price_avg float64 297 6305 [1185.71, 1148.81, 1196.43, 1552.63, 1411.45, ...
cafe_avg_price_5000 float64 297 7857 [947.14, 911.31, 949.55, 1242.11, 1132.66, 102...
cafe_count_5000_na_price int64 0 175 [12, 9, 10, 4, 143, 35, 34, 11, 28, 14, 1, 111...
cafe_count_5000_price_500 int64 0 610 [39, 49, 29, 7, 566, 137, 163, 37, 86, 28, 8, ...
cafe_count_5000_price_1000 int64 0 603 [48, 65, 45, 21, 578, 163, 194, 46, 81, 39, 16...
cafe_count_5000_price_1500 int64 0 599 [40, 36, 25, 15, 552, 155, 144, 69, 30, 11, 39...
cafe_count_5000_price_2500 int64 0 375 [9, 15, 10, 11, 319, 62, 81, 19, 1, 8, 13, 254...
cafe_count_5000_price_4000 int64 0 148 [4, 3, 2, 108, 14, 16, 8, 13, 1, 9, 19, 6, 0, ...
cafe_count_5000_price_high int64 0 31 [0, 1, 17, 3, 22, 2, 6, 4, 10, 11, 5, 25, 20, ...
big_church_count_5000 int64 0 152 [13, 15, 11, 4, 135, 53, 38, 18, 10, 5, 7, 57,...
church_count_5000 int64 0 251 [22, 29, 27, 4, 236, 78, 80, 18, 34, 20, 9, 15...
mosque_count_5000 int64 0 3 [1, 0, 2]
leisure_count_5000 int64 0 107 [0, 10, 4, 91, 20, 27, 3, 2, 72, 28, 1, 9, 5, ...
sport_count_5000 int64 0 216 [52, 66, 67, 26, 195, 113, 127, 47, 85, 17, 35...
market_count_5000 int64 0 22 [4, 14, 10, 3, 17, 8, 11, 1, 6, 0, 7, 9, 13, 1...
price_doc int64 0 9296 [5850000, 6000000, 5700000, 13100000, 16331452...

Data Pre-Processing

* In data Pre-Processing we will not change the data just see what the data is all about,

*how many Numerical, Categorical Variables are Present

* Percentage of Missing values and plotting the median values for NA values and given records of that column

* Plotting the numerical and Categorical columns w.r.t Target Variable

1. Seeing the Percentage Of Missing Values

In [8]:
features_with_na=[features for features in train_data.columns if train_data[features].isnull().sum()>1]

for feature in features_with_na:
    print(feature, np.round(train_data[feature].isnull().mean(), 4),  ' % missing values')
life_sq 0.2095  % missing values
floor 0.0055  % missing values
max_floor 0.3141  % missing values
material 0.3141  % missing values
build_year 0.4465  % missing values
num_room 0.3141  % missing values
kitch_sq 0.3141  % missing values
state 0.445  % missing values
preschool_quota 0.2195  % missing values
school_quota 0.2194  % missing values
hospital_beds_raion 0.4739  % missing values
raion_build_count_with_material_info 0.1638  % missing values
build_count_block 0.1638  % missing values
build_count_wood 0.1638  % missing values
build_count_frame 0.1638  % missing values
build_count_brick 0.1638  % missing values
build_count_monolith 0.1638  % missing values
build_count_panel 0.1638  % missing values
build_count_foam 0.1638  % missing values
build_count_slag 0.1638  % missing values
build_count_mix 0.1638  % missing values
raion_build_count_with_builddate_info 0.1638  % missing values
build_count_before_1920 0.1638  % missing values
build_count_1921-1945 0.1638  % missing values
build_count_1946-1970 0.1638  % missing values
build_count_1971-1995 0.1638  % missing values
build_count_after_1995 0.1638  % missing values
metro_min_walk 0.0008  % missing values
metro_km_walk 0.0008  % missing values
railroad_station_walk_km 0.0008  % missing values
railroad_station_walk_min 0.0008  % missing values
ID_railroad_station_walk 0.0008  % missing values
cafe_sum_500_min_price_avg 0.4359  % missing values
cafe_sum_500_max_price_avg 0.4359  % missing values
cafe_avg_price_500 0.4359  % missing values
cafe_sum_1000_min_price_avg 0.2141  % missing values
cafe_sum_1000_max_price_avg 0.2141  % missing values
cafe_avg_price_1000 0.2141  % missing values
cafe_sum_1500_min_price_avg 0.1378  % missing values
cafe_sum_1500_max_price_avg 0.1378  % missing values
cafe_avg_price_1500 0.1378  % missing values
cafe_sum_2000_min_price_avg 0.0566  % missing values
cafe_sum_2000_max_price_avg 0.0566  % missing values
cafe_avg_price_2000 0.0566  % missing values
cafe_sum_3000_min_price_avg 0.0325  % missing values
cafe_sum_3000_max_price_avg 0.0325  % missing values
cafe_avg_price_3000 0.0325  % missing values
prom_part_5000 0.0058  % missing values
cafe_sum_5000_min_price_avg 0.0097  % missing values
cafe_sum_5000_max_price_avg 0.0097  % missing values
cafe_avg_price_5000 0.0097  % missing values

2. Plotting w.r.t Columns Having NA Values VS Present Records

In [9]:
for feature in features_with_na:
    data = train_data.copy()

    data[feature] = np.where(data[feature].isnull(), 1, 0)
    
    
    data.groupby(feature)['price_doc'].median().plot.bar()
    plt.title(feature)
    plt.show()

3.NUMERICAL VARIABLES

* We are having different types of numerical variables

a.Numerical Continous Variables

b.Numerical Discrete Variables

c.Date variables

In [10]:
numerical_features = [feature for feature in train_data.columns if train_data[feature].dtypes != 'O']

print('Number of numerical variables: ', len(numerical_features))


train_data[numerical_features].head()
Number of numerical variables:  276
Out[10]:
id full_sq life_sq floor max_floor material build_year num_room kitch_sq state area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25_raion shopping_centers_raion office_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road2_km ID_big_road2 railroad_km zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km catering_km green_part_500 prom_part_500 office_count_500 office_sqm_500 trc_count_500 trc_sqm_500 cafe_count_500 cafe_sum_500_min_price_avg cafe_sum_500_max_price_avg cafe_avg_price_500 cafe_count_500_na_price cafe_count_500_price_500 cafe_count_500_price_1000 cafe_count_500_price_1500 cafe_count_500_price_2500 cafe_count_500_price_4000 cafe_count_500_price_high big_church_count_500 church_count_500 mosque_count_500 leisure_count_500 sport_count_500 market_count_500 green_part_1000 prom_part_1000 office_count_1000 office_sqm_1000 trc_count_1000 trc_sqm_1000 cafe_count_1000 cafe_sum_1000_min_price_avg cafe_sum_1000_max_price_avg cafe_avg_price_1000 cafe_count_1000_na_price cafe_count_1000_price_500 cafe_count_1000_price_1000 cafe_count_1000_price_1500 cafe_count_1000_price_2500 cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
0 1 43 27.0 4.0 NaN NaN NaN NaN NaN NaN 6.407578e+06 155572 0.189727 0.000070 9576 5001.0 5 10309 11065.0 5 0 240.0 1 0 7 3 0 16 1 86206 40477 45729 21154 11007 10147 98207 52277 45930 36211 10580 25631 9576 4899 4677 10309 5463 4846 23603 12286 11317 17508 9425 8083 18654 9709 8945 211.0 25.0 0.0 0.0 0.0 2.0 184.0 0.0 0.0 0.0 211.0 0.0 0.0 0.0 206.0 5.0 1 2.590241 1.131260 13.575119 1.131260 0.145700 0.177975 2.158587 0.600973 1.080934 23.683460 1.804127 3.633334 5.419893 65.038716 1.0 5.419893 6.905893 1 0.274985 3.299822 0.992631 1.422391 10.918587 13.100618 13.675657 15.156211 1.422391 1 3.830951 5 1.305159 14.231961 101 24.292406 1 18.152338 5.718519 1.210027 1.062513 5.814135 4.308127 10.814172 1.676258 0.485841 3.065047 1.107594 8.148591 3.516513 2.392353 4.248036 0.974743 6.715026 0.884350 0.648488 0.637189 0.947962 0.177975 0.625783 0.628187 3.932040 14.053047 7.389498 7.023705 0.516838 0.00 0.00 0 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 0 0 1 0 7.36 0.00 1 30500 3 55600 19 527.78 888.89 708.33 1 10 4 3 1 0 0 1 2 0 0 6 1 14.27 6.92 3 39554 9 171420 34 566.67 969.70 768.18 1 14 11 6 2 0 0 1 2 0 0 7 1 11.77 15.97 9 188854 19 1244891 36 614.29 1042.86 828.57 1 15 11 6 2 1 0 1 2 0 0 10 1 11.98 13.55 12 251554 23 1419204 68 639.68 1079.37 859.52 5 21 22 16 3 1 0 2 4 0 0 21 1 13.09 13.31 29 807385 52 4036616 152 708.57 1185.71 947.14 12 39 48 40 9 4 0 13 22 1 0 52 4 5850000
1 2 34 19.0 3.0 NaN NaN NaN NaN NaN NaN 9.589337e+06 115352 0.372602 0.049637 6880 3119.0 5 7759 6237.0 8 0 229.0 1 0 6 1 1 3 0 76284 34200 42084 15727 7925 7802 70194 35622 34572 29431 9266 20165 6880 3466 3414 7759 3909 3850 17700 8998 8702 15164 7571 7593 13729 6929 6800 245.0 83.0 1.0 0.0 67.0 4.0 90.0 0.0 0.0 0.0 244.0 1.0 1.0 143.0 84.0 15.0 2 0.936700 0.647337 7.620630 0.635053 0.147754 0.273345 0.550690 0.065321 0.966479 1.317476 4.655004 8.648587 3.411993 40.943917 2.0 3.641773 4.679745 2 0.065263 0.783160 0.698081 9.503405 3.103996 6.444333 8.132640 8.698054 2.887377 2 3.103996 4 0.694536 9.242586 32 5.706113 2 9.034642 3.489954 2.724295 1.246149 3.419574 0.725560 6.910568 3.424716 0.668364 2.000154 8.972823 6.127073 1.161579 2.543747 12.649879 1.477723 1.852560 0.686252 0.519311 0.688796 1.072315 0.273345 0.967821 0.471447 4.841544 6.829889 0.709260 2.358840 0.230287 25.14 0.00 0 0 0 0 5 860.00 1500.00 1180.00 0 1 3 0 0 1 0 0 1 0 0 0 0 26.66 0.07 2 86600 5 94065 13 615.38 1076.92 846.15 0 5 6 1 0 1 0 1 2 0 4 2 0 21.53 7.71 3 102910 7 127065 17 694.12 1205.88 950.00 0 6 7 1 2 1 0 1 5 0 4 9 0 22.37 19.25 4 165510 8 179065 21 695.24 1190.48 942.86 0 7 8 3 2 1 0 1 5 0 4 11 0 18.07 27.32 12 821986 14 491565 30 631.03 1086.21 858.62 1 11 11 4 2 1 0 1 7 0 6 19 1 10.26 27.47 66 2690465 40 2034942 177 673.81 1148.81 911.31 9 49 65 36 15 3 0 15 29 1 10 66 14 6000000
2 3 43 29.0 2.0 NaN NaN NaN NaN NaN NaN 4.808270e+06 101708 0.112560 0.118537 5879 1463.0 4 6207 5580.0 7 0 1183.0 1 0 5 1 0 0 1 101982 46076 55906 13028 6835 6193 63388 31813 31575 25292 7609 17683 5879 3095 2784 6207 3269 2938 14884 7821 7063 19401 9045 10356 11252 5916 5336 330.0 59.0 0.0 0.0 206.0 4.0 60.0 0.0 1.0 0.0 330.0 1.0 0.0 246.0 63.0 20.0 3 2.120999 1.637996 17.351515 1.445960 0.049102 0.158072 0.374848 0.453172 0.939275 4.912660 3.381083 11.996480 1.277658 15.331896 3.0 1.277658 1.701420 3 0.328756 3.945073 0.468265 5.604800 2.927487 6.963403 8.054252 9.067885 0.647250 3 2.927487 4 0.700691 9.540544 5 6.710302 3 5.777394 7.506612 0.772216 1.602183 3.682455 3.562188 5.752368 1.375443 0.733101 1.239304 1.978517 0.767569 1.952771 0.621357 7.682303 0.097144 0.841254 1.510089 1.486533 1.543049 0.391957 0.158072 3.178751 0.755946 7.922152 4.273200 3.156423 4.958214 0.190462 1.67 0.00 0 0 0 0 3 666.67 1166.67 916.67 0 0 2 1 0 0 0 0 0 0 0 0 0 4.99 0.29 0 0 0 0 9 642.86 1142.86 892.86 2 0 5 2 0 0 0 0 1 0 0 5 3 9.92 6.73 0 0 1 2600 14 516.67 916.67 716.67 2 4 6 2 0 0 0 0 4 0 0 6 5 12.99 12.75 4 100200 7 52550 24 563.64 977.27 770.45 2 8 9 4 1 0 0 0 4 0 0 8 5 12.14 26.46 8 110856 7 52550 41 697.44 1192.31 944.87 2 9 17 9 3 1 0 0 11 0 0 20 6 13.69 21.58 43 1478160 35 1572990 122 702.68 1196.43 949.55 10 29 45 25 10 3 0 11 27 0 4 67 10 5700000
3 4 89 50.0 9.0 NaN NaN NaN NaN NaN NaN 1.258354e+07 178473 0.194703 0.069753 13087 6839.0 9 13670 17063.0 10 0 NaN 1 0 17 6 0 11 4 21155 9828 11327 28563 14680 13883 120381 60040 60341 29529 9083 20446 13087 6645 6442 13670 7126 6544 32063 16513 15550 3292 1450 1842 24934 12782 12152 458.0 9.0 51.0 12.0 124.0 50.0 201.0 0.0 9.0 2.0 459.0 13.0 24.0 40.0 130.0 252.0 4 1.489049 0.984537 11.565624 0.963802 0.179441 0.236455 0.078090 0.106125 0.451173 15.623710 2.017080 14.317640 4.291432 51.497190 4.0 3.816045 5.271136 4 0.131597 1.579164 1.200336 2.677824 14.606501 17.457198 18.309433 19.487005 2.677824 1 2.780449 17 1.999265 17.478380 83 6.734618 1 27.667863 9.522538 6.348716 1.767612 11.178333 0.583025 27.892717 0.811275 0.623484 1.950317 6.483172 7.385521 4.923843 3.549558 8.789894 2.163735 10.903161 0.622272 0.599914 0.934273 0.892674 0.236455 1.031777 1.561505 15.300449 16.990677 16.041521 5.029696 0.465820 17.36 0.57 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 2 0 0 0 0 0 0 0 0 0 19.25 10.35 1 11000 6 80780 12 658.33 1083.33 870.83 0 3 4 5 0 0 0 0 0 0 0 3 1 28.38 6.57 2 11000 7 89492 23 673.91 1130.43 902.17 0 5 9 8 1 0 0 1 0 0 0 9 2 32.29 5.73 2 11000 7 89492 25 660.00 1120.00 890.00 0 5 11 8 1 0 0 1 1 0 0 13 2 20.79 3.57 4 167000 12 205756 32 718.75 1218.75 968.75 0 5 14 10 3 0 0 1 2 0 0 18 3 14.18 3.89 8 244166 22 942180 61 931.58 1552.63 1242.11 4 7 21 15 11 2 1 4 4 0 0 26 3 13100000
4 5 77 77.0 4.0 NaN NaN NaN NaN NaN NaN 8.398461e+06 108171 0.015234 0.037316 5706 3240.0 7 6748 7770.0 9 0 562.0 4 2 25 2 0 10 93 28179 13522 14657 13368 7159 6209 68043 34236 33807 26760 8563 18197 5706 2982 2724 6748 3664 3084 15237 8113 7124 5164 2583 2581 11631 6223 5408 746.0 48.0 0.0 0.0 643.0 16.0 35.0 0.0 3.0 1.0 746.0 371.0 114.0 146.0 62.0 53.0 5 1.257186 0.876620 8.266305 0.688859 0.247901 0.376838 0.258289 0.236214 0.392871 10.683540 2.936581 11.903910 0.853960 10.247521 5.0 1.595898 2.156284 113 0.071480 0.857764 0.820294 11.616653 1.721834 0.046810 0.787593 2.578671 1.721834 4 3.133531 10 0.084113 1.595898 113 1.423428 4 6.515857 8.671016 1.638318 3.632640 4.587917 2.609420 9.155057 1.969738 0.220288 2.544696 3.975401 3.610754 0.307915 1.864637 3.779781 1.121703 0.991683 0.892668 0.429052 0.077901 0.810801 0.376838 0.378756 0.121681 2.584370 1.112486 1.800125 1.339652 0.026102 3.56 4.44 15 293699 1 45000 48 702.22 1166.67 934.44 3 17 10 11 7 0 0 1 4 0 2 3 0 3.34 8.29 46 420952 3 158200 153 763.45 1272.41 1017.93 8 39 45 39 19 2 1 7 12 0 6 7 0 4.12 4.83 93 1195735 9 445900 272 766.80 1272.73 1019.76 19 70 74 72 30 6 1 18 30 0 10 14 2 4.53 5.02 149 1625130 17 564843 483 765.93 1269.23 1017.58 28 130 129 131 50 14 1 35 61 0 17 21 3 5.06 8.62 305 3420907 60 2296870 1068 853.03 1410.45 1131.74 63 266 267 262 149 57 4 70 121 1 40 77 5 8.38 10.92 689 8404624 114 3503058 2283 853.88 1411.45 1132.66 143 566 578 552 319 108 17 135 236 2 91 195 14 16331452

C. Date variables Present or not

In [11]:
year_feature = [feature for feature in numerical_features if 'Yr' in feature or 'Year' in feature or "date" in feature]

year_feature
Out[11]:
['raion_build_count_with_builddate_info']
In [12]:
train_data["raion_build_count_with_builddate_info"]
Out[12]:
0        211.0
1        244.0
2        330.0
3        459.0
4        746.0
         ...  
30466    282.0
30467    650.0
30468      NaN
30469    186.0
30470    303.0
Name: raion_build_count_with_builddate_info, Length: 30471, dtype: float64

B. Numerical Discrete Variables

In [13]:
discrete_feature=[feature for feature in numerical_features if len(train_data[feature].unique())<25 and feature not in year_feature+['id']]
print("Discrete Variables Count: {}".format(len(discrete_feature)))
Discrete Variables Count: 46
In [14]:
for feature in discrete_feature:
    data=train_data.copy()
    data.groupby(feature)['price_doc'].median().plot.bar()
    plt.xlabel(feature)
    plt.ylabel('price_doc')
    plt.title(feature)
    plt.show()

A. Numerical Continous Variables

In [15]:
continuous_feature=[feature for feature in numerical_features if feature not in discrete_feature+year_feature+['id']]
print("Continuous feature Count {}".format(len(continuous_feature)))
Continuous feature Count 228
In [16]:
for feature in continuous_feature:
    data=train_data.copy()
    data[feature].hist(bins=25)
    plt.xlabel(feature)
    plt.ylabel("Count")
    plt.title(feature)
    plt.show()

Applying Log

In [17]:
for feature in continuous_feature:
    data=train_data.copy()
    if 0 in data[feature].unique():
        pass
    else:
        data[feature]=np.log(data[feature])
        data['price_doc']=np.log(data['price_doc'])
        plt.scatter(data[feature],data['price_doc'])
        plt.xlabel(feature)
        plt.ylabel('price_doc')
        plt.title(feature)
        plt.show()

Finding Outliers

In [18]:
for feature in continuous_feature:
    data=train_data.copy()
    if 0 in data[feature].unique():
        pass
    else:
        data[feature]=np.log(data[feature])
        data.boxplot(column=feature)
        plt.ylabel(feature)
        plt.title(feature)
        plt.show()

CATEGORICAL VARIABLES

In [19]:
categorical_features=[feature for feature in train_data.columns if train_data[feature].dtypes=='O']
len(categorical_features)
Out[19]:
16
In [20]:
for feature in categorical_features:
    print('The feature is {} and number of categories are {}'.format(feature,len(train_data[feature].unique())))
The feature is timestamp and number of categories are 1161
The feature is product_type and number of categories are 2
The feature is sub_area and number of categories are 146
The feature is culture_objects_top_25 and number of categories are 2
The feature is thermal_power_plant_raion and number of categories are 2
The feature is incineration_raion and number of categories are 2
The feature is oil_chemistry_raion and number of categories are 2
The feature is radiation_raion and number of categories are 2
The feature is railroad_terminal_raion and number of categories are 2
The feature is big_market_raion and number of categories are 2
The feature is nuclear_reactor_raion and number of categories are 2
The feature is detention_facility_raion and number of categories are 2
The feature is water_1line and number of categories are 2
The feature is big_road1_1line and number of categories are 2
The feature is railroad_1line and number of categories are 2
The feature is ecology and number of categories are 5
In [21]:
for feature in categorical_features:
    data=train_data.copy()
    data.groupby(feature)['price_doc'].median().plot.bar()
    plt.xlabel(feature)
    plt.ylabel('price_doc')
    plt.title(feature)
    plt.show()

FEATURE ENGINEERING

In [22]:
train_data["timestamp"]
Out[22]:
0        2011-08-20
1        2011-08-23
2        2011-08-27
3        2011-09-01
4        2011-09-05
            ...    
30466    2015-06-30
30467    2015-06-30
30468    2015-06-30
30469    2015-06-30
30470    2015-06-30
Name: timestamp, Length: 30471, dtype: object
In [23]:
train_data['date']=pd.to_datetime(train_data.timestamp)
In [24]:
train_data.drop("timestamp",axis=1,inplace=True)
In [25]:
train_data['year']=train_data.date.dt.year
In [26]:
train_data['month']=train_data.date.dt.month
In [27]:
train_data['day']=train_data.date.dt.day
In [28]:
train_data.drop("date",axis=1,inplace=True)
In [29]:
train_data.dtypes
Out[29]:
id                                         int64
full_sq                                    int64
life_sq                                  float64
floor                                    float64
max_floor                                float64
material                                 float64
build_year                               float64
num_room                                 float64
kitch_sq                                 float64
state                                    float64
product_type                              object
sub_area                                  object
area_m                                   float64
raion_popul                                int64
green_zone_part                          float64
indust_part                              float64
children_preschool                         int64
preschool_quota                          float64
preschool_education_centers_raion          int64
children_school                            int64
school_quota                             float64
school_education_centers_raion             int64
school_education_centers_top_20_raion      int64
hospital_beds_raion                      float64
healthcare_centers_raion                   int64
university_top_20_raion                    int64
sport_objects_raion                        int64
additional_education_raion                 int64
culture_objects_top_25                    object
culture_objects_top_25_raion               int64
shopping_centers_raion                     int64
office_raion                               int64
thermal_power_plant_raion                 object
incineration_raion                        object
oil_chemistry_raion                       object
radiation_raion                           object
railroad_terminal_raion                   object
big_market_raion                          object
nuclear_reactor_raion                     object
detention_facility_raion                  object
full_all                                   int64
male_f                                     int64
female_f                                   int64
young_all                                  int64
young_male                                 int64
young_female                               int64
work_all                                   int64
work_male                                  int64
work_female                                int64
ekder_all                                  int64
ekder_male                                 int64
ekder_female                               int64
0_6_all                                    int64
0_6_male                                   int64
0_6_female                                 int64
7_14_all                                   int64
7_14_male                                  int64
7_14_female                                int64
0_17_all                                   int64
0_17_male                                  int64
0_17_female                                int64
16_29_all                                  int64
16_29_male                                 int64
16_29_female                               int64
0_13_all                                   int64
0_13_male                                  int64
0_13_female                                int64
raion_build_count_with_material_info     float64
build_count_block                        float64
build_count_wood                         float64
build_count_frame                        float64
build_count_brick                        float64
build_count_monolith                     float64
build_count_panel                        float64
build_count_foam                         float64
build_count_slag                         float64
build_count_mix                          float64
raion_build_count_with_builddate_info    float64
build_count_before_1920                  float64
build_count_1921-1945                    float64
build_count_1946-1970                    float64
build_count_1971-1995                    float64
build_count_after_1995                   float64
ID_metro                                   int64
metro_min_avto                           float64
metro_km_avto                            float64
metro_min_walk                           float64
metro_km_walk                            float64
kindergarten_km                          float64
school_km                                float64
park_km                                  float64
green_zone_km                            float64
industrial_km                            float64
water_treatment_km                       float64
cemetery_km                              float64
incineration_km                          float64
railroad_station_walk_km                 float64
railroad_station_walk_min                float64
ID_railroad_station_walk                 float64
railroad_station_avto_km                 float64
railroad_station_avto_min                float64
ID_railroad_station_avto                   int64
public_transport_station_km              float64
public_transport_station_min_walk        float64
water_km                                 float64
water_1line                               object
mkad_km                                  float64
ttk_km                                   float64
sadovoe_km                               float64
bulvar_ring_km                           float64
kremlin_km                               float64
big_road1_km                             float64
ID_big_road1                               int64
big_road1_1line                           object
big_road2_km                             float64
ID_big_road2                               int64
railroad_km                              float64
railroad_1line                            object
zd_vokzaly_avto_km                       float64
ID_railroad_terminal                       int64
bus_terminal_avto_km                     float64
ID_bus_terminal                            int64
oil_chemistry_km                         float64
nuclear_reactor_km                       float64
radiation_km                             float64
power_transmission_line_km               float64
thermal_power_plant_km                   float64
ts_km                                    float64
big_market_km                            float64
market_shop_km                           float64
fitness_km                               float64
swim_pool_km                             float64
ice_rink_km                              float64
stadium_km                               float64
basketball_km                            float64
hospice_morgue_km                        float64
detention_facility_km                    float64
public_healthcare_km                     float64
university_km                            float64
workplaces_km                            float64
shopping_centers_km                      float64
office_km                                float64
additional_education_km                  float64
preschool_km                             float64
big_church_km                            float64
church_synagogue_km                      float64
mosque_km                                float64
theater_km                               float64
museum_km                                float64
exhibition_km                            float64
catering_km                              float64
ecology                                   object
green_part_500                           float64
prom_part_500                            float64
office_count_500                           int64
office_sqm_500                             int64
trc_count_500                              int64
trc_sqm_500                                int64
cafe_count_500                             int64
cafe_sum_500_min_price_avg               float64
cafe_sum_500_max_price_avg               float64
cafe_avg_price_500                       float64
cafe_count_500_na_price                    int64
cafe_count_500_price_500                   int64
cafe_count_500_price_1000                  int64
cafe_count_500_price_1500                  int64
cafe_count_500_price_2500                  int64
cafe_count_500_price_4000                  int64
cafe_count_500_price_high                  int64
big_church_count_500                       int64
church_count_500                           int64
mosque_count_500                           int64
leisure_count_500                          int64
sport_count_500                            int64
market_count_500                           int64
green_part_1000                          float64
prom_part_1000                           float64
office_count_1000                          int64
office_sqm_1000                            int64
trc_count_1000                             int64
trc_sqm_1000                               int64
cafe_count_1000                            int64
cafe_sum_1000_min_price_avg              float64
cafe_sum_1000_max_price_avg              float64
cafe_avg_price_1000                      float64
cafe_count_1000_na_price                   int64
cafe_count_1000_price_500                  int64
cafe_count_1000_price_1000                 int64
cafe_count_1000_price_1500                 int64
cafe_count_1000_price_2500                 int64
cafe_count_1000_price_4000                 int64
cafe_count_1000_price_high                 int64
big_church_count_1000                      int64
church_count_1000                          int64
mosque_count_1000                          int64
leisure_count_1000                         int64
sport_count_1000                           int64
market_count_1000                          int64
green_part_1500                          float64
prom_part_1500                           float64
office_count_1500                          int64
office_sqm_1500                            int64
trc_count_1500                             int64
trc_sqm_1500                               int64
cafe_count_1500                            int64
cafe_sum_1500_min_price_avg              float64
cafe_sum_1500_max_price_avg              float64
cafe_avg_price_1500                      float64
cafe_count_1500_na_price                   int64
cafe_count_1500_price_500                  int64
cafe_count_1500_price_1000                 int64
cafe_count_1500_price_1500                 int64
cafe_count_1500_price_2500                 int64
cafe_count_1500_price_4000                 int64
cafe_count_1500_price_high                 int64
big_church_count_1500                      int64
church_count_1500                          int64
mosque_count_1500                          int64
leisure_count_1500                         int64
sport_count_1500                           int64
market_count_1500                          int64
green_part_2000                          float64
prom_part_2000                           float64
office_count_2000                          int64
office_sqm_2000                            int64
trc_count_2000                             int64
trc_sqm_2000                               int64
cafe_count_2000                            int64
cafe_sum_2000_min_price_avg              float64
cafe_sum_2000_max_price_avg              float64
cafe_avg_price_2000                      float64
cafe_count_2000_na_price                   int64
cafe_count_2000_price_500                  int64
cafe_count_2000_price_1000                 int64
cafe_count_2000_price_1500                 int64
cafe_count_2000_price_2500                 int64
cafe_count_2000_price_4000                 int64
cafe_count_2000_price_high                 int64
big_church_count_2000                      int64
church_count_2000                          int64
mosque_count_2000                          int64
leisure_count_2000                         int64
sport_count_2000                           int64
market_count_2000                          int64
green_part_3000                          float64
prom_part_3000                           float64
office_count_3000                          int64
office_sqm_3000                            int64
trc_count_3000                             int64
trc_sqm_3000                               int64
cafe_count_3000                            int64
cafe_sum_3000_min_price_avg              float64
cafe_sum_3000_max_price_avg              float64
cafe_avg_price_3000                      float64
cafe_count_3000_na_price                   int64
cafe_count_3000_price_500                  int64
cafe_count_3000_price_1000                 int64
cafe_count_3000_price_1500                 int64
cafe_count_3000_price_2500                 int64
cafe_count_3000_price_4000                 int64
cafe_count_3000_price_high                 int64
big_church_count_3000                      int64
church_count_3000                          int64
mosque_count_3000                          int64
leisure_count_3000                         int64
sport_count_3000                           int64
market_count_3000                          int64
green_part_5000                          float64
prom_part_5000                           float64
office_count_5000                          int64
office_sqm_5000                            int64
trc_count_5000                             int64
trc_sqm_5000                               int64
cafe_count_5000                            int64
cafe_sum_5000_min_price_avg              float64
cafe_sum_5000_max_price_avg              float64
cafe_avg_price_5000                      float64
cafe_count_5000_na_price                   int64
cafe_count_5000_price_500                  int64
cafe_count_5000_price_1000                 int64
cafe_count_5000_price_1500                 int64
cafe_count_5000_price_2500                 int64
cafe_count_5000_price_4000                 int64
cafe_count_5000_price_high                 int64
big_church_count_5000                      int64
church_count_5000                          int64
mosque_count_5000                          int64
leisure_count_5000                         int64
sport_count_5000                           int64
market_count_5000                          int64
price_doc                                  int64
year                                       int64
month                                      int64
day                                        int64
dtype: object
In [30]:
test_data=pd.read_csv("test_data.csv")
In [31]:
test_data["timestamp"][0]
Out[31]:
'2015-07-01'
In [32]:
test_data['date']=pd.to_datetime(test_data.timestamp)
In [33]:
test_data.drop("timestamp",axis=1,inplace=True)
In [34]:
test_data['year']=test_data.date.dt.year
In [35]:
test_data['month']=test_data.date.dt.month
In [36]:
test_data['day']=test_data.date.dt.day
In [37]:
test_data.drop("date",axis=1,inplace=True)
In [38]:
test_data.dtypes
Out[38]:
id                                         int64
full_sq                                  float64
life_sq                                  float64
floor                                      int64
max_floor                                  int64
material                                   int64
build_year                               float64
num_room                                   int64
kitch_sq                                 float64
state                                    float64
product_type                              object
sub_area                                  object
area_m                                   float64
raion_popul                                int64
green_zone_part                          float64
indust_part                              float64
children_preschool                         int64
preschool_quota                          float64
preschool_education_centers_raion          int64
children_school                            int64
school_quota                             float64
school_education_centers_raion             int64
school_education_centers_top_20_raion      int64
hospital_beds_raion                      float64
healthcare_centers_raion                   int64
university_top_20_raion                    int64
sport_objects_raion                        int64
additional_education_raion                 int64
culture_objects_top_25                    object
culture_objects_top_25_raion               int64
shopping_centers_raion                     int64
office_raion                               int64
thermal_power_plant_raion                 object
incineration_raion                        object
oil_chemistry_raion                       object
radiation_raion                           object
railroad_terminal_raion                   object
big_market_raion                          object
nuclear_reactor_raion                     object
detention_facility_raion                  object
full_all                                   int64
male_f                                     int64
female_f                                   int64
young_all                                  int64
young_male                                 int64
young_female                               int64
work_all                                   int64
work_male                                  int64
work_female                                int64
ekder_all                                  int64
ekder_male                                 int64
ekder_female                               int64
0_6_all                                    int64
0_6_male                                   int64
0_6_female                                 int64
7_14_all                                   int64
7_14_male                                  int64
7_14_female                                int64
0_17_all                                   int64
0_17_male                                  int64
0_17_female                                int64
16_29_all                                  int64
16_29_male                                 int64
16_29_female                               int64
0_13_all                                   int64
0_13_male                                  int64
0_13_female                                int64
raion_build_count_with_material_info     float64
build_count_block                        float64
build_count_wood                         float64
build_count_frame                        float64
build_count_brick                        float64
build_count_monolith                     float64
build_count_panel                        float64
build_count_foam                         float64
build_count_slag                         float64
build_count_mix                          float64
raion_build_count_with_builddate_info    float64
build_count_before_1920                  float64
build_count_1921-1945                    float64
build_count_1946-1970                    float64
build_count_1971-1995                    float64
build_count_after_1995                   float64
ID_metro                                   int64
metro_min_avto                           float64
metro_km_avto                            float64
metro_min_walk                           float64
metro_km_walk                            float64
kindergarten_km                          float64
school_km                                float64
park_km                                  float64
green_zone_km                            float64
industrial_km                            float64
water_treatment_km                       float64
cemetery_km                              float64
incineration_km                          float64
railroad_station_walk_km                 float64
railroad_station_walk_min                float64
ID_railroad_station_walk                 float64
railroad_station_avto_km                 float64
railroad_station_avto_min                float64
ID_railroad_station_avto                   int64
public_transport_station_km              float64
public_transport_station_min_walk        float64
water_km                                 float64
water_1line                               object
mkad_km                                  float64
ttk_km                                   float64
sadovoe_km                               float64
bulvar_ring_km                           float64
kremlin_km                               float64
big_road1_km                             float64
ID_big_road1                               int64
big_road1_1line                           object
big_road2_km                             float64
ID_big_road2                               int64
railroad_km                              float64
railroad_1line                            object
zd_vokzaly_avto_km                       float64
ID_railroad_terminal                       int64
bus_terminal_avto_km                     float64
ID_bus_terminal                            int64
oil_chemistry_km                         float64
nuclear_reactor_km                       float64
radiation_km                             float64
power_transmission_line_km               float64
thermal_power_plant_km                   float64
ts_km                                    float64
big_market_km                            float64
market_shop_km                           float64
fitness_km                               float64
swim_pool_km                             float64
ice_rink_km                              float64
stadium_km                               float64
basketball_km                            float64
hospice_morgue_km                        float64
detention_facility_km                    float64
public_healthcare_km                     float64
university_km                            float64
workplaces_km                            float64
shopping_centers_km                      float64
office_km                                float64
additional_education_km                  float64
preschool_km                             float64
big_church_km                            float64
church_synagogue_km                      float64
mosque_km                                float64
theater_km                               float64
museum_km                                float64
exhibition_km                            float64
catering_km                              float64
ecology                                   object
green_part_500                           float64
prom_part_500                            float64
office_count_500                           int64
office_sqm_500                             int64
trc_count_500                              int64
trc_sqm_500                                int64
cafe_count_500                             int64
cafe_sum_500_min_price_avg               float64
cafe_sum_500_max_price_avg               float64
cafe_avg_price_500                       float64
cafe_count_500_na_price                    int64
cafe_count_500_price_500                   int64
cafe_count_500_price_1000                  int64
cafe_count_500_price_1500                  int64
cafe_count_500_price_2500                  int64
cafe_count_500_price_4000                  int64
cafe_count_500_price_high                  int64
big_church_count_500                       int64
church_count_500                           int64
mosque_count_500                           int64
leisure_count_500                          int64
sport_count_500                            int64
market_count_500                           int64
green_part_1000                          float64
prom_part_1000                           float64
office_count_1000                          int64
office_sqm_1000                            int64
trc_count_1000                             int64
trc_sqm_1000                               int64
cafe_count_1000                            int64
cafe_sum_1000_min_price_avg              float64
cafe_sum_1000_max_price_avg              float64
cafe_avg_price_1000                      float64
cafe_count_1000_na_price                   int64
cafe_count_1000_price_500                  int64
cafe_count_1000_price_1000                 int64
cafe_count_1000_price_1500                 int64
cafe_count_1000_price_2500                 int64
cafe_count_1000_price_4000                 int64
cafe_count_1000_price_high                 int64
big_church_count_1000                      int64
church_count_1000                          int64
mosque_count_1000                          int64
leisure_count_1000                         int64
sport_count_1000                           int64
market_count_1000                          int64
green_part_1500                          float64
prom_part_1500                           float64
office_count_1500                          int64
office_sqm_1500                            int64
trc_count_1500                             int64
trc_sqm_1500                               int64
cafe_count_1500                            int64
cafe_sum_1500_min_price_avg              float64
cafe_sum_1500_max_price_avg              float64
cafe_avg_price_1500                      float64
cafe_count_1500_na_price                   int64
cafe_count_1500_price_500                  int64
cafe_count_1500_price_1000                 int64
cafe_count_1500_price_1500                 int64
cafe_count_1500_price_2500                 int64
cafe_count_1500_price_4000                 int64
cafe_count_1500_price_high                 int64
big_church_count_1500                      int64
church_count_1500                          int64
mosque_count_1500                          int64
leisure_count_1500                         int64
sport_count_1500                           int64
market_count_1500                          int64
green_part_2000                          float64
prom_part_2000                           float64
office_count_2000                          int64
office_sqm_2000                            int64
trc_count_2000                             int64
trc_sqm_2000                               int64
cafe_count_2000                            int64
cafe_sum_2000_min_price_avg              float64
cafe_sum_2000_max_price_avg              float64
cafe_avg_price_2000                      float64
cafe_count_2000_na_price                   int64
cafe_count_2000_price_500                  int64
cafe_count_2000_price_1000                 int64
cafe_count_2000_price_1500                 int64
cafe_count_2000_price_2500                 int64
cafe_count_2000_price_4000                 int64
cafe_count_2000_price_high                 int64
big_church_count_2000                      int64
church_count_2000                          int64
mosque_count_2000                          int64
leisure_count_2000                         int64
sport_count_2000                           int64
market_count_2000                          int64
green_part_3000                          float64
prom_part_3000                           float64
office_count_3000                          int64
office_sqm_3000                            int64
trc_count_3000                             int64
trc_sqm_3000                               int64
cafe_count_3000                            int64
cafe_sum_3000_min_price_avg              float64
cafe_sum_3000_max_price_avg              float64
cafe_avg_price_3000                      float64
cafe_count_3000_na_price                   int64
cafe_count_3000_price_500                  int64
cafe_count_3000_price_1000                 int64
cafe_count_3000_price_1500                 int64
cafe_count_3000_price_2500                 int64
cafe_count_3000_price_4000                 int64
cafe_count_3000_price_high                 int64
big_church_count_3000                      int64
church_count_3000                          int64
mosque_count_3000                          int64
leisure_count_3000                         int64
sport_count_3000                           int64
market_count_3000                          int64
green_part_5000                          float64
prom_part_5000                           float64
office_count_5000                          int64
office_sqm_5000                            int64
trc_count_5000                             int64
trc_sqm_5000                               int64
cafe_count_5000                            int64
cafe_sum_5000_min_price_avg              float64
cafe_sum_5000_max_price_avg              float64
cafe_avg_price_5000                      float64
cafe_count_5000_na_price                   int64
cafe_count_5000_price_500                  int64
cafe_count_5000_price_1000                 int64
cafe_count_5000_price_1500                 int64
cafe_count_5000_price_2500                 int64
cafe_count_5000_price_4000                 int64
cafe_count_5000_price_high                 int64
big_church_count_5000                      int64
church_count_5000                          int64
mosque_count_5000                          int64
leisure_count_5000                         int64
sport_count_5000                           int64
market_count_5000                          int64
year                                       int64
month                                      int64
day                                        int64
dtype: object
In [39]:
train_data.shape
Out[39]:
(30471, 294)
In [40]:
test_data.shape
Out[40]:
(7662, 293)

Handling Categorical NA values

In [41]:
features_nan=[feature for feature in train_data.columns if train_data[feature].isnull().sum()>1 and train_data[feature].dtypes=='O']

for feature in features_nan:
    print("{}: {}% missing values".format(feature,np.round(train_data[feature].isnull().mean(),4)))
In [42]:
features_nan
Out[42]:
[]

Handling Numerical NA values

In [43]:
numerical_with_nan=[feature for feature in train_data.columns if train_data[feature].isnull().sum()>1 and train_data[feature].dtypes!='O']

for feature in numerical_with_nan:
    print("{}: {}% missing value".format(feature,np.around(train_data[feature].isnull().mean(),4)))
life_sq: 0.2095% missing value
floor: 0.0055% missing value
max_floor: 0.3141% missing value
material: 0.3141% missing value
build_year: 0.4465% missing value
num_room: 0.3141% missing value
kitch_sq: 0.3141% missing value
state: 0.445% missing value
preschool_quota: 0.2195% missing value
school_quota: 0.2194% missing value
hospital_beds_raion: 0.4739% missing value
raion_build_count_with_material_info: 0.1638% missing value
build_count_block: 0.1638% missing value
build_count_wood: 0.1638% missing value
build_count_frame: 0.1638% missing value
build_count_brick: 0.1638% missing value
build_count_monolith: 0.1638% missing value
build_count_panel: 0.1638% missing value
build_count_foam: 0.1638% missing value
build_count_slag: 0.1638% missing value
build_count_mix: 0.1638% missing value
raion_build_count_with_builddate_info: 0.1638% missing value
build_count_before_1920: 0.1638% missing value
build_count_1921-1945: 0.1638% missing value
build_count_1946-1970: 0.1638% missing value
build_count_1971-1995: 0.1638% missing value
build_count_after_1995: 0.1638% missing value
metro_min_walk: 0.0008% missing value
metro_km_walk: 0.0008% missing value
railroad_station_walk_km: 0.0008% missing value
railroad_station_walk_min: 0.0008% missing value
ID_railroad_station_walk: 0.0008% missing value
cafe_sum_500_min_price_avg: 0.4359% missing value
cafe_sum_500_max_price_avg: 0.4359% missing value
cafe_avg_price_500: 0.4359% missing value
cafe_sum_1000_min_price_avg: 0.2141% missing value
cafe_sum_1000_max_price_avg: 0.2141% missing value
cafe_avg_price_1000: 0.2141% missing value
cafe_sum_1500_min_price_avg: 0.1378% missing value
cafe_sum_1500_max_price_avg: 0.1378% missing value
cafe_avg_price_1500: 0.1378% missing value
cafe_sum_2000_min_price_avg: 0.0566% missing value
cafe_sum_2000_max_price_avg: 0.0566% missing value
cafe_avg_price_2000: 0.0566% missing value
cafe_sum_3000_min_price_avg: 0.0325% missing value
cafe_sum_3000_max_price_avg: 0.0325% missing value
cafe_avg_price_3000: 0.0325% missing value
prom_part_5000: 0.0058% missing value
cafe_sum_5000_min_price_avg: 0.0097% missing value
cafe_sum_5000_max_price_avg: 0.0097% missing value
cafe_avg_price_5000: 0.0097% missing value
In [44]:
for feature in numerical_with_nan:
    ## We will replace by using median since there are outliers
    median_value=train_data[feature].median()
    
    ## create a new feature to capture nan values
    train_data[feature+'nan']=np.where(train_data[feature].isnull(),1,0)
    train_data[feature].fillna(median_value,inplace=True)
    
train_data[numerical_with_nan].isnull().sum()
Out[44]:
life_sq                                  0
floor                                    0
max_floor                                0
material                                 0
build_year                               0
num_room                                 0
kitch_sq                                 0
state                                    0
preschool_quota                          0
school_quota                             0
hospital_beds_raion                      0
raion_build_count_with_material_info     0
build_count_block                        0
build_count_wood                         0
build_count_frame                        0
build_count_brick                        0
build_count_monolith                     0
build_count_panel                        0
build_count_foam                         0
build_count_slag                         0
build_count_mix                          0
raion_build_count_with_builddate_info    0
build_count_before_1920                  0
build_count_1921-1945                    0
build_count_1946-1970                    0
build_count_1971-1995                    0
build_count_after_1995                   0
metro_min_walk                           0
metro_km_walk                            0
railroad_station_walk_km                 0
railroad_station_walk_min                0
ID_railroad_station_walk                 0
cafe_sum_500_min_price_avg               0
cafe_sum_500_max_price_avg               0
cafe_avg_price_500                       0
cafe_sum_1000_min_price_avg              0
cafe_sum_1000_max_price_avg              0
cafe_avg_price_1000                      0
cafe_sum_1500_min_price_avg              0
cafe_sum_1500_max_price_avg              0
cafe_avg_price_1500                      0
cafe_sum_2000_min_price_avg              0
cafe_sum_2000_max_price_avg              0
cafe_avg_price_2000                      0
cafe_sum_3000_min_price_avg              0
cafe_sum_3000_max_price_avg              0
cafe_avg_price_3000                      0
prom_part_5000                           0
cafe_sum_5000_min_price_avg              0
cafe_sum_5000_max_price_avg              0
cafe_avg_price_5000                      0
dtype: int64
In [45]:
train_data.dtypes
Out[45]:
id                                  int64
full_sq                             int64
life_sq                           float64
floor                             float64
max_floor                         float64
                                   ...   
cafe_avg_price_3000nan              int32
prom_part_5000nan                   int32
cafe_sum_5000_min_price_avgnan      int32
cafe_sum_5000_max_price_avgnan      int32
cafe_avg_price_5000nan              int32
Length: 345, dtype: object
In [46]:
features_nan_test=[feature for feature in test_data.columns if test_data[feature].isnull().sum()>1 and test_data[feature].dtypes=='O']

for feature in features_nan_test:
    print("{}: {}% missing values".format(feature,np.round(test_data[feature].isnull().mean(),4)))
product_type: 0.0043% missing values
In [47]:
def replace_cat_feature_test(test_data,features_nan_test):
    data=test_data.copy()
    data[features_nan_test]=data[features_nan_test].fillna('Missing')
    return data

test_data=replace_cat_feature_test(test_data,features_nan_test)

test_data[features_nan_test].isnull().sum()
Out[47]:
product_type    0
dtype: int64
In [48]:
numerical_with_nan_test=[feature for feature in test_data.columns if test_data[feature].isnull().sum()>1 and test_data[feature].dtypes!='O']

for feature in numerical_with_nan_test:
    print("{}: {}% missing value".format(feature,np.around(test_data[feature].isnull().mean(),4)))
life_sq: 0.1535% missing value
build_year: 0.1369% missing value
state: 0.0906% missing value
preschool_quota: 0.2083% missing value
school_quota: 0.2082% missing value
hospital_beds_raion: 0.4461% missing value
raion_build_count_with_material_info: 0.159% missing value
build_count_block: 0.159% missing value
build_count_wood: 0.159% missing value
build_count_frame: 0.159% missing value
build_count_brick: 0.159% missing value
build_count_monolith: 0.159% missing value
build_count_panel: 0.159% missing value
build_count_foam: 0.159% missing value
build_count_slag: 0.159% missing value
build_count_mix: 0.159% missing value
raion_build_count_with_builddate_info: 0.159% missing value
build_count_before_1920: 0.159% missing value
build_count_1921-1945: 0.159% missing value
build_count_1946-1970: 0.159% missing value
build_count_1971-1995: 0.159% missing value
build_count_after_1995: 0.159% missing value
metro_min_walk: 0.0044% missing value
metro_km_walk: 0.0044% missing value
railroad_station_walk_km: 0.0044% missing value
railroad_station_walk_min: 0.0044% missing value
ID_railroad_station_walk: 0.0044% missing value
cafe_sum_500_min_price_avg: 0.4123% missing value
cafe_sum_500_max_price_avg: 0.4123% missing value
cafe_avg_price_500: 0.4123% missing value
cafe_sum_1000_min_price_avg: 0.1595% missing value
cafe_sum_1000_max_price_avg: 0.1595% missing value
cafe_avg_price_1000: 0.1595% missing value
cafe_sum_1500_min_price_avg: 0.1072% missing value
cafe_sum_1500_max_price_avg: 0.1072% missing value
cafe_avg_price_1500: 0.1072% missing value
green_part_2000: 0.0025% missing value
cafe_sum_2000_min_price_avg: 0.0553% missing value
cafe_sum_2000_max_price_avg: 0.0553% missing value
cafe_avg_price_2000: 0.0553% missing value
cafe_sum_3000_min_price_avg: 0.0238% missing value
cafe_sum_3000_max_price_avg: 0.0238% missing value
cafe_avg_price_3000: 0.0238% missing value
prom_part_5000: 0.012% missing value
cafe_sum_5000_min_price_avg: 0.0167% missing value
cafe_sum_5000_max_price_avg: 0.0167% missing value
cafe_avg_price_5000: 0.0167% missing value
In [49]:
for feature in numerical_with_nan_test:
    ## We will replace by using median since there are outliers
    median_value=test_data[feature].median()
    
    ## create a new feature to capture nan values
    test_data[feature+'nan']=np.where(test_data[feature].isnull(),1,0)
    test_data[feature].fillna(median_value,inplace=True)
    
test_data[numerical_with_nan_test].isnull().sum()
Out[49]:
life_sq                                  0
build_year                               0
state                                    0
preschool_quota                          0
school_quota                             0
hospital_beds_raion                      0
raion_build_count_with_material_info     0
build_count_block                        0
build_count_wood                         0
build_count_frame                        0
build_count_brick                        0
build_count_monolith                     0
build_count_panel                        0
build_count_foam                         0
build_count_slag                         0
build_count_mix                          0
raion_build_count_with_builddate_info    0
build_count_before_1920                  0
build_count_1921-1945                    0
build_count_1946-1970                    0
build_count_1971-1995                    0
build_count_after_1995                   0
metro_min_walk                           0
metro_km_walk                            0
railroad_station_walk_km                 0
railroad_station_walk_min                0
ID_railroad_station_walk                 0
cafe_sum_500_min_price_avg               0
cafe_sum_500_max_price_avg               0
cafe_avg_price_500                       0
cafe_sum_1000_min_price_avg              0
cafe_sum_1000_max_price_avg              0
cafe_avg_price_1000                      0
cafe_sum_1500_min_price_avg              0
cafe_sum_1500_max_price_avg              0
cafe_avg_price_1500                      0
green_part_2000                          0
cafe_sum_2000_min_price_avg              0
cafe_sum_2000_max_price_avg              0
cafe_avg_price_2000                      0
cafe_sum_3000_min_price_avg              0
cafe_sum_3000_max_price_avg              0
cafe_avg_price_3000                      0
prom_part_5000                           0
cafe_sum_5000_min_price_avg              0
cafe_sum_5000_max_price_avg              0
cafe_avg_price_5000                      0
dtype: int64
In [50]:
test_data.shape
Out[50]:
(7662, 340)
In [51]:
train_data.shape
Out[51]:
(30471, 345)
In [52]:
numerical_features_test= [feature for feature in test_data.columns if test_data[feature].dtypes != 'O']

print('Number of numerical variables: ', len(numerical_features_test))

test_data[numerical_features_test].head()
Number of numerical variables:  325
Out[52]:
id full_sq life_sq floor max_floor material build_year num_room kitch_sq state area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25_raion shopping_centers_raion office_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road2_km ID_big_road2 railroad_km zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km catering_km green_part_500 prom_part_500 office_count_500 office_sqm_500 trc_count_500 trc_sqm_500 cafe_count_500 cafe_sum_500_min_price_avg cafe_sum_500_max_price_avg cafe_avg_price_500 cafe_count_500_na_price cafe_count_500_price_500 cafe_count_500_price_1000 ... cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 year month day life_sqnan build_yearnan statenan preschool_quotanan school_quotanan hospital_beds_raionnan raion_build_count_with_material_infonan build_count_blocknan build_count_woodnan build_count_framenan build_count_bricknan build_count_monolithnan build_count_panelnan build_count_foamnan build_count_slagnan build_count_mixnan raion_build_count_with_builddate_infonan build_count_before_1920nan build_count_1921-1945nan build_count_1946-1970nan build_count_1971-1995nan build_count_after_1995nan metro_min_walknan metro_km_walknan railroad_station_walk_kmnan railroad_station_walk_minnan ID_railroad_station_walknan cafe_sum_500_min_price_avgnan cafe_sum_500_max_price_avgnan cafe_avg_price_500nan cafe_sum_1000_min_price_avgnan cafe_sum_1000_max_price_avgnan cafe_avg_price_1000nan cafe_sum_1500_min_price_avgnan cafe_sum_1500_max_price_avgnan cafe_avg_price_1500nan green_part_2000nan cafe_sum_2000_min_price_avgnan cafe_sum_2000_max_price_avgnan cafe_avg_price_2000nan cafe_sum_3000_min_price_avgnan cafe_sum_3000_max_price_avgnan cafe_avg_price_3000nan prom_part_5000nan cafe_sum_5000_min_price_avgnan cafe_sum_5000_max_price_avgnan cafe_avg_price_5000nan
0 30474 39.0 20.7 2 9 1 1998.0 1 8.9 3.0 2.615514e+07 178264 0.137846 0.041116 14080 11926.0 11 14892 24750.0 13 1 990.0 1 0 13 4 0 4 4 102618 47681 54937 30808 16251 14557 121369 59138 62231 26087 7410 18677 14080 7457 6623 14892 7839 7053 34341 18094 16247 19906 9676 10230 27123 14340 12783 1681.0 173.0 607.0 19.0 245.0 116.0 431.0 1.0 84.0 5.0 1680.0 34.0 299.0 439.0 109.0 799.0 45 1.258957 0.735908 8.830901 0.735908 0.078502 0.746962 2.048487 0.061485 1.205404 0.967696 0.781053 10.56540 4.812102 57.745220 39.0 4.850748 6.274963 39 0.114134 1.369603 0.248151 6.374826 19.651101 22.790985 24.079707 24.779082 4.152246 2 5.706484 38 0.490549 27.553486 32 8.424959 9 22.624362 16.224083 6.620081 4.121874 8.957780 8.824060 15.483912 5.353674 0.225788 3.673942 11.810839 20.392427 9.131977 3.300120 25.462741 1.613152 17.214870 7.922610 2.414138 4.923614 0.514211 0.746962 0.749142 0.848297 1.917736 19.953413 14.052207 12.228576 0.446324 42.22 0.00 0 0 0 0 1 1000.0 1500.00 1250.00 0 0 0 ... 0 0 1 1 0 0 3 0 20.14 0.70 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 2 0 0 0 1 2 0 0 4 0 15.17 1.18 0 0 0 0 3 1000.00 1500.00 1250.00 0 0 0 3 0 0 0 1 2 1 0 5 0 14.69 2.87 0 0 3 73000 12 781.82 1227.27 1004.55 1 2 2 7 0 0 0 1 3 1 0 7 0 21.58 4.69 1 37550 8 299166 19 676.47 1088.24 882.35 2 5 4 8 0 0 0 1 10 1 0 14 1 2015 7 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 30475 79.2 30.4 8 17 1 0.0 3 1.0 1.0 2.553630e+07 4001 0.496315 0.007122 275 3062.0 0 264 6974.0 0 0 990.0 0 0 0 0 0 1 0 17790 8350 9443 574 297 277 2566 1356 1211 861 244 617 275 143 133 264 136 128 646 336 311 3796 2035 1762 506 261 245 295.0 44.0 1.0 0.0 84.0 7.0 92.0 0.0 0.0 0.0 295.0 0.0 2.0 144.0 73.0 31.0 21 4.230425 3.444625 41.335498 3.444625 1.192193 1.332570 4.400427 0.000000 0.742377 16.049420 2.244906 18.50054 5.458057 65.496687 24.0 5.458057 6.859956 24 0.826083 9.912993 0.799853 6.847813 16.975793 19.692960 20.864427 21.722620 2.148398 13 4.410488 27 2.342346 27.421853 50 16.913175 8 29.425443 17.080113 8.545593 4.932827 10.039833 3.654955 15.092542 8.156185 1.313180 4.244082 4.438242 13.445121 8.332180 7.095895 26.807204 3.775300 12.440198 9.672779 1.764298 3.764819 1.694967 1.332570 1.672126 1.162371 12.239901 13.006107 9.661063 4.323941 0.705873 86.33 0.00 0 0 0 0 0 700.0 1166.67 927.78 0 0 0 ... 0 0 0 0 0 0 0 0 59.28 5.31 0 0 0 0 3 833.33 1500.00 1166.67 0 0 2 0 1 0 0 0 1 0 0 1 0 49.26 4.06 0 0 1 5000 7 757.14 1285.71 1021.43 0 1 3 2 1 0 0 1 2 0 0 1 0 39.50 3.32 0 0 2 22000 10 680.00 1200.00 940.00 0 1 6 2 1 0 0 1 5 0 0 7 0 39.10 7.70 2 177300 6 231300 20 733.33 1250.00 991.67 2 4 8 4 1 1 0 2 11 0 1 12 1 2015 7 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 30476 40.5 25.1 3 5 2 1960.0 2 4.8 2.0 9.946335e+06 139322 0.065409 0.225825 6400 2232.0 7 6558 7966.0 7 0 1548.0 3 0 13 0 0 2 7 36154 16222 19932 13799 6937 6862 91795 44734 47061 33728 9653 24075 6400 3209 3191 6558 3317 3241 15514 7813 7701 8137 3787 4350 12162 6117 6045 561.0 111.0 0.0 0.0 254.0 3.0 189.0 0.0 4.0 0.0 561.0 0.0 5.0 437.0 79.0 40.0 44 1.585306 1.122214 13.466563 1.122214 0.065324 0.194608 2.513006 0.580638 0.900408 11.749900 3.389848 10.19563 3.628293 43.539514 68.0 3.977659 5.375048 59 0.116686 1.400229 1.384824 3.499380 5.627481 8.090528 8.671086 10.320728 0.580638 10 3.499380 1 2.220941 10.093318 5 7.921607 3 1.823381 14.431252 0.826743 2.388288 3.760642 3.290966 16.304596 0.644830 0.966254 1.332737 3.131143 1.464174 1.499581 0.487817 6.718082 0.711768 4.862872 3.506298 1.456661 1.223804 2.330995 0.194608 1.400094 1.177527 9.938735 2.983875 1.988346 0.794245 0.320864 0.00 0.00 0 0 0 0 3 400.0 750.00 575.00 1 1 1 ... 0 0 0 0 0 1 6 2 30.97 8.75 2 34100 1 0 19 655.56 1111.11 883.33 1 6 6 4 2 0 0 1 1 0 1 12 3 40.90 10.51 6 80237 3 14090 28 633.33 1092.59 862.96 1 7 12 6 2 0 0 2 2 0 4 14 4 45.86 9.08 8 215237 6 39106 37 608.33 1069.44 838.89 1 8 19 7 2 0 0 2 3 0 5 22 4 25.62 13.59 27 427889 26 1024431 179 668.97 1132.18 900.57 5 53 64 42 11 4 0 10 21 0 10 71 11 2015 7 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 30477 62.8 36.0 17 17 1 2016.0 2 62.8 3.0 2.149409e+07 7122 0.262459 0.017647 489 3062.0 0 469 6974.0 0 0 990.0 0 0 0 2 0 0 0 9553 4529 5024 1021 529 493 4568 2414 2155 1533 435 1099 489 254 236 469 242 228 1150 597 553 2155 1206 950 900 465 435 295.0 44.0 1.0 0.0 84.0 7.0 92.0 0.0 0.0 0.0 295.0 0.0 2.0 144.0 73.0 31.0 45 7.931398 6.038848 68.559794 5.713316 3.189083 3.540105 5.612835 0.025446 0.466738 5.061917 2.701804 14.62944 10.284167 123.410001 39.0 10.609698 13.517419 39 3.093209 37.118504 0.233017 8.928836 22.094252 25.062928 26.226045 26.960463 2.722667 38 8.601110 2 4.476081 37.436772 50 13.979650 8 26.895118 19.942295 9.434351 6.218331 13.345715 10.480798 10.723870 11.112624 4.480234 8.577223 15.200509 18.560234 12.253021 6.831966 30.366022 5.731266 15.382678 11.306566 6.589381 8.102094 0.403429 3.540105 5.411312 0.213853 6.153091 18.121220 16.938290 14.171229 0.454087 22.01 0.15 0 0 0 0 1 300.0 500.00 400.00 0 1 0 ... 0 0 0 2 0 0 0 0 28.82 3.59 0 0 0 0 1 300.00 500.00 400.00 0 1 0 0 0 0 0 0 3 0 0 0 0 31.35 2.99 0 0 0 0 1 300.00 500.00 400.00 0 1 0 0 0 0 0 0 3 0 0 0 0 34.87 1.34 0 0 0 0 1 300.00 500.00 400.00 0 1 0 0 0 0 0 0 4 0 0 0 0 24.25 1.66 0 0 0 0 5 1560.00 2500.00 2030.00 0 1 0 1 1 2 0 0 10 0 0 2 0 2015 7 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 30478 40.0 40.0 17 17 1 0.0 1 1.0 1.0 2.553630e+07 4001 0.496315 0.007122 275 3062.0 0 264 6974.0 0 0 990.0 0 0 0 0 0 1 0 17790 8350 9443 574 297 277 2566 1356 1211 861 244 617 275 143 133 264 136 128 646 336 311 3796 2035 1762 506 261 245 295.0 44.0 1.0 0.0 84.0 7.0 92.0 0.0 0.0 0.0 295.0 0.0 2.0 144.0 73.0 31.0 21 2.152792 1.722233 20.666800 1.722233 0.897889 1.234235 4.566595 0.427248 0.353642 16.784630 2.250137 19.14919 3.735666 44.827989 24.0 3.735666 4.782323 24 0.630014 7.560163 0.394422 7.123215 17.148737 19.868997 21.038561 21.905792 2.808077 13 3.688405 27 1.727223 25.699461 50 17.366661 8 29.968660 17.397666 9.036942 5.506770 10.102328 3.729416 15.546028 6.433794 1.519553 2.521691 2.715850 13.898607 8.355285 7.401423 25.084813 2.052908 12.893684 9.479093 1.806570 4.338453 1.339078 1.234235 1.192543 1.186621 12.652956 13.459593 9.890758 4.555385 0.066503 3.33 3.70 0 0 0 0 2 1000.0 1750.00 1375.00 0 0 1 ... 0 0 0 0 0 0 0 0 43.85 1.55 0 0 0 0 3 833.33 1500.00 1166.67 0 0 2 0 1 0 0 1 1 0 0 0 0 38.61 3.12 0 0 2 22000 7 757.14 1285.71 1021.43 0 1 3 2 1 0 0 1 2 0 0 3 0 41.64 2.11 0 0 2 22000 9 700.00 1222.22 961.11 0 1 5 2 1 0 0 1 4 0 0 6 0 35.62 6.96 1 117300 4 201300 20 747.37 1263.16 1005.26 1 4 8 5 1 1 0 2 12 0 1 11 1 2015 7 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 325 columns

In [53]:
year_feature_test= [feature for feature in numerical_features if 'Yr' in feature or 'Year' in feature]

year_feature_test
Out[53]:
[]
In [54]:
discrete_feature_test=[feature for feature in numerical_features_test if len(test_data[feature].unique())<25 and feature not in year_feature_test+['id']]
print("Discrete Variables Count: {}".format(len(discrete_feature_test)))
Discrete Variables Count: 101
In [55]:
continuous_feature_test=[feature for feature in numerical_features_test if feature not in discrete_feature_test+year_feature_test+['Id']]
print("Continuous feature Count {}".format(len(continuous_feature_test)))
Continuous feature Count 224
In [56]:
categorical_features_test=[feature for feature in test_data.columns if test_data[feature].dtypes=='O']
len(categorical_features_test)
Out[56]:
15
In [57]:
for feature in categorical_features_test:
    print('The feature is {} and number of categories are {}'.format(feature,len(test_data[feature].unique())))
The feature is product_type and number of categories are 3
The feature is sub_area and number of categories are 145
The feature is culture_objects_top_25 and number of categories are 2
The feature is thermal_power_plant_raion and number of categories are 2
The feature is incineration_raion and number of categories are 2
The feature is oil_chemistry_raion and number of categories are 2
The feature is radiation_raion and number of categories are 2
The feature is railroad_terminal_raion and number of categories are 2
The feature is big_market_raion and number of categories are 2
The feature is nuclear_reactor_raion and number of categories are 2
The feature is detention_facility_raion and number of categories are 2
The feature is water_1line and number of categories are 2
The feature is big_road1_1line and number of categories are 2
The feature is railroad_1line and number of categories are 2
The feature is ecology and number of categories are 5

RARE VARIABLE

In [58]:
for feature in categorical_features_test:
    test_data[feature] = test_data[feature].map((train_data.groupby(feature)['price_doc'].count()/len(train_data)).to_dict())
    #train_data[feature] = train_data[feature].map((train_data.groupby(feature)['SalePrice'].count()/len(train_data)).to_dict())
In [59]:
categorical_features=[feature for feature in train_data.columns if train_data[feature].dtypes=='O' if feature not in ["timestamp"]]
len(categorical_features)
Out[59]:
15
In [60]:
for feature in categorical_features:
    temp=train_data.groupby(feature)['price_doc'].count()/len(train_data)
    temp_df=temp[temp>0.01].index
    train_data[feature]=np.where(train_data[feature].isin(temp_df),train_data[feature],'Rare_var')
In [61]:
for feature in categorical_features_test:
    temp=test_data[feature]
    temp_df=temp[temp>0.01].index
    test_data[feature]=np.where(test_data[feature].isin(temp_df),test_data[feature],'Rare_var')

TARGET ENCODING

In [62]:
for feature in categorical_features:
    labels_ordered=train_data.groupby([feature])['price_doc'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    train_data[feature]=train_data[feature].map(labels_ordered)
In [63]:
for feature in categorical_features_test:
    labels_ordered=test_data[feature]
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    test_data[feature]=test_data[feature].map(labels_ordered)
In [64]:
train_data.head()
Out[64]:
id full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km ... leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc year month day life_sqnan floornan max_floornan materialnan build_yearnan num_roomnan kitch_sqnan statenan preschool_quotanan school_quotanan hospital_beds_raionnan raion_build_count_with_material_infonan build_count_blocknan build_count_woodnan build_count_framenan build_count_bricknan build_count_monolithnan build_count_panelnan build_count_foamnan build_count_slagnan build_count_mixnan raion_build_count_with_builddate_infonan build_count_before_1920nan build_count_1921-1945nan build_count_1946-1970nan build_count_1971-1995nan build_count_after_1995nan metro_min_walknan metro_km_walknan railroad_station_walk_kmnan railroad_station_walk_minnan ID_railroad_station_walknan cafe_sum_500_min_price_avgnan cafe_sum_500_max_price_avgnan cafe_avg_price_500nan cafe_sum_1000_min_price_avgnan cafe_sum_1000_max_price_avgnan cafe_avg_price_1000nan cafe_sum_1500_min_price_avgnan cafe_sum_1500_max_price_avgnan cafe_avg_price_1500nan cafe_sum_2000_min_price_avgnan cafe_sum_2000_max_price_avgnan cafe_avg_price_2000nan cafe_sum_3000_min_price_avgnan cafe_sum_3000_max_price_avgnan cafe_avg_price_3000nan prom_part_5000nan cafe_sum_5000_min_price_avgnan cafe_sum_5000_max_price_avgnan cafe_avg_price_5000nan
0 1 43 27.0 4.0 12.0 1.0 1979.0 2.0 6.0 2.0 1 16 6.407578e+06 155572 0.189727 0.000070 9576 5001.0 5 10309 11065.0 5 0 240.0 1 0 7 3 0 0 16 1 0 1 1 0 0 1 0 0 86206 40477 45729 21154 11007 10147 98207 52277 45930 36211 10580 25631 9576 4899 4677 10309 5463 4846 23603 12286 11317 17508 9425 8083 18654 9709 8945 211.0 25.0 0.0 0.0 0.0 2.0 184.0 0.0 0.0 0.0 211.0 0.0 0.0 0.0 206.0 5.0 1 2.590241 1.131260 13.575119 1.131260 0.145700 0.177975 2.158587 0.600973 1.080934 23.683460 1.804127 3.633334 5.419893 65.038716 1.0 5.419893 6.905893 1 0.274985 3.299822 0.992631 1 1.422391 10.918587 13.100618 13.675657 15.156211 1.422391 1 0 3.830951 5 1.305159 1 14.231961 101 24.292406 1 18.152338 5.718519 1.210027 1.062513 5.814135 4.308127 10.814172 1.676258 0.485841 3.065047 1.107594 8.148591 3.516513 2.392353 4.248036 0.974743 6.715026 0.884350 0.648488 0.637189 0.947962 0.177975 0.625783 0.628187 3.932040 14.053047 7.389498 7.023705 ... 0 6 1 14.27 6.92 3 39554 9 171420 34 566.67 969.70 768.18 1 14 11 6 2 0 0 1 2 0 0 7 1 11.77 15.97 9 188854 19 1244891 36 614.29 1042.86 828.57 1 15 11 6 2 1 0 1 2 0 0 10 1 11.98 13.55 12 251554 23 1419204 68 639.68 1079.37 859.52 5 21 22 16 3 1 0 2 4 0 0 21 1 13.09 13.31 29 807385 52 4036616 152 708.57 1185.71 947.14 12 39 48 40 9 4 0 13 22 1 0 52 4 5850000 2011 8 20 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 2 34 19.0 3.0 12.0 1.0 1979.0 2.0 6.0 2.0 1 17 9.589337e+06 115352 0.372602 0.049637 6880 3119.0 5 7759 6237.0 8 0 229.0 1 0 6 1 1 1 3 0 0 1 1 0 0 1 0 0 76284 34200 42084 15727 7925 7802 70194 35622 34572 29431 9266 20165 6880 3466 3414 7759 3909 3850 17700 8998 8702 15164 7571 7593 13729 6929 6800 245.0 83.0 1.0 0.0 67.0 4.0 90.0 0.0 0.0 0.0 244.0 1.0 1.0 143.0 84.0 15.0 2 0.936700 0.647337 7.620630 0.635053 0.147754 0.273345 0.550690 0.065321 0.966479 1.317476 4.655004 8.648587 3.411993 40.943917 2.0 3.641773 4.679745 2 0.065263 0.783160 0.698081 1 9.503405 3.103996 6.444333 8.132640 8.698054 2.887377 2 0 3.103996 4 0.694536 1 9.242586 32 5.706113 2 9.034642 3.489954 2.724295 1.246149 3.419574 0.725560 6.910568 3.424716 0.668364 2.000154 8.972823 6.127073 1.161579 2.543747 12.649879 1.477723 1.852560 0.686252 0.519311 0.688796 1.072315 0.273345 0.967821 0.471447 4.841544 6.829889 0.709260 2.358840 ... 4 2 0 21.53 7.71 3 102910 7 127065 17 694.12 1205.88 950.00 0 6 7 1 2 1 0 1 5 0 4 9 0 22.37 19.25 4 165510 8 179065 21 695.24 1190.48 942.86 0 7 8 3 2 1 0 1 5 0 4 11 0 18.07 27.32 12 821986 14 491565 30 631.03 1086.21 858.62 1 11 11 4 2 1 0 1 7 0 6 19 1 10.26 27.47 66 2690465 40 2034942 177 673.81 1148.81 911.31 9 49 65 36 15 3 0 15 29 1 10 66 14 6000000 2011 8 23 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 3 43 29.0 2.0 12.0 1.0 1979.0 2.0 6.0 2.0 1 16 4.808270e+06 101708 0.112560 0.118537 5879 1463.0 4 6207 5580.0 7 0 1183.0 1 0 5 1 0 0 0 1 0 1 1 1 0 1 0 0 101982 46076 55906 13028 6835 6193 63388 31813 31575 25292 7609 17683 5879 3095 2784 6207 3269 2938 14884 7821 7063 19401 9045 10356 11252 5916 5336 330.0 59.0 0.0 0.0 206.0 4.0 60.0 0.0 1.0 0.0 330.0 1.0 0.0 246.0 63.0 20.0 3 2.120999 1.637996 17.351515 1.445960 0.049102 0.158072 0.374848 0.453172 0.939275 4.912660 3.381083 11.996480 1.277658 15.331896 3.0 1.277658 1.701420 3 0.328756 3.945073 0.468265 1 5.604800 2.927487 6.963403 8.054252 9.067885 0.647250 3 0 2.927487 4 0.700691 1 9.540544 5 6.710302 3 5.777394 7.506612 0.772216 1.602183 3.682455 3.562188 5.752368 1.375443 0.733101 1.239304 1.978517 0.767569 1.952771 0.621357 7.682303 0.097144 0.841254 1.510089 1.486533 1.543049 0.391957 0.158072 3.178751 0.755946 7.922152 4.273200 3.156423 4.958214 ... 0 5 3 9.92 6.73 0 0 1 2600 14 516.67 916.67 716.67 2 4 6 2 0 0 0 0 4 0 0 6 5 12.99 12.75 4 100200 7 52550 24 563.64 977.27 770.45 2 8 9 4 1 0 0 0 4 0 0 8 5 12.14 26.46 8 110856 7 52550 41 697.44 1192.31 944.87 2 9 17 9 3 1 0 0 11 0 0 20 6 13.69 21.58 43 1478160 35 1572990 122 702.68 1196.43 949.55 10 29 45 25 10 3 0 11 27 0 4 67 10 5700000 2011 8 27 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 4 89 50.0 9.0 12.0 1.0 1979.0 2.0 6.0 2.0 1 18 1.258354e+07 178473 0.194703 0.069753 13087 6839.0 9 13670 17063.0 10 0 990.0 1 0 17 6 0 0 11 4 0 1 1 0 0 1 0 0 21155 9828 11327 28563 14680 13883 120381 60040 60341 29529 9083 20446 13087 6645 6442 13670 7126 6544 32063 16513 15550 3292 1450 1842 24934 12782 12152 458.0 9.0 51.0 12.0 124.0 50.0 201.0 0.0 9.0 2.0 459.0 13.0 24.0 40.0 130.0 252.0 4 1.489049 0.984537 11.565624 0.963802 0.179441 0.236455 0.078090 0.106125 0.451173 15.623710 2.017080 14.317640 4.291432 51.497190 4.0 3.816045 5.271136 4 0.131597 1.579164 1.200336 1 2.677824 14.606501 17.457198 18.309433 19.487005 2.677824 1 0 2.780449 17 1.999265 1 17.478380 83 6.734618 1 27.667863 9.522538 6.348716 1.767612 11.178333 0.583025 27.892717 0.811275 0.623484 1.950317 6.483172 7.385521 4.923843 3.549558 8.789894 2.163735 10.903161 0.622272 0.599914 0.934273 0.892674 0.236455 1.031777 1.561505 15.300449 16.990677 16.041521 5.029696 ... 0 3 1 28.38 6.57 2 11000 7 89492 23 673.91 1130.43 902.17 0 5 9 8 1 0 0 1 0 0 0 9 2 32.29 5.73 2 11000 7 89492 25 660.00 1120.00 890.00 0 5 11 8 1 0 0 1 1 0 0 13 2 20.79 3.57 4 167000 12 205756 32 718.75 1218.75 968.75 0 5 14 10 3 0 0 1 2 0 0 18 3 14.18 3.89 8 244166 22 942180 61 931.58 1552.63 1242.11 4 7 21 15 11 2 1 4 4 0 0 26 3 13100000 2011 9 1 0 0 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 5 77 77.0 4.0 12.0 1.0 1979.0 2.0 6.0 2.0 1 16 8.398461e+06 108171 0.015234 0.037316 5706 3240.0 7 6748 7770.0 9 0 562.0 4 2 25 2 0 0 10 93 0 1 1 1 1 1 0 0 28179 13522 14657 13368 7159 6209 68043 34236 33807 26760 8563 18197 5706 2982 2724 6748 3664 3084 15237 8113 7124 5164 2583 2581 11631 6223 5408 746.0 48.0 0.0 0.0 643.0 16.0 35.0 0.0 3.0 1.0 746.0 371.0 114.0 146.0 62.0 53.0 5 1.257186 0.876620 8.266305 0.688859 0.247901 0.376838 0.258289 0.236214 0.392871 10.683540 2.936581 11.903910 0.853960 10.247521 5.0 1.595898 2.156284 113 0.071480 0.857764 0.820294 1 11.616653 1.721834 0.046810 0.787593 2.578671 1.721834 4 0 3.133531 10 0.084113 0 1.595898 113 1.423428 4 6.515857 8.671016 1.638318 3.632640 4.587917 2.609420 9.155057 1.969738 0.220288 2.544696 3.975401 3.610754 0.307915 1.864637 3.779781 1.121703 0.991683 0.892668 0.429052 0.077901 0.810801 0.376838 0.378756 0.121681 2.584370 1.112486 1.800125 1.339652 ... 6 7 0 4.12 4.83 93 1195735 9 445900 272 766.80 1272.73 1019.76 19 70 74 72 30 6 1 18 30 0 10 14 2 4.53 5.02 149 1625130 17 564843 483 765.93 1269.23 1017.58 28 130 129 131 50 14 1 35 61 0 17 21 3 5.06 8.62 305 3420907 60 2296870 1068 853.03 1410.45 1131.74 63 266 267 262 149 57 4 70 121 1 40 77 5 8.38 10.92 689 8404624 114 3503058 2283 853.88 1411.45 1132.66 143 566 578 552 319 108 17 135 236 2 91 195 14 16331452 2011 9 5 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 345 columns

In [65]:
test_data.head()
Out[65]:
id full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km ... cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 year month day life_sqnan build_yearnan statenan preschool_quotanan school_quotanan hospital_beds_raionnan raion_build_count_with_material_infonan build_count_blocknan build_count_woodnan build_count_framenan build_count_bricknan build_count_monolithnan build_count_panelnan build_count_foamnan build_count_slagnan build_count_mixnan raion_build_count_with_builddate_infonan build_count_before_1920nan build_count_1921-1945nan build_count_1946-1970nan build_count_1971-1995nan build_count_after_1995nan metro_min_walknan metro_km_walknan railroad_station_walk_kmnan railroad_station_walk_minnan ID_railroad_station_walknan cafe_sum_500_min_price_avgnan cafe_sum_500_max_price_avgnan cafe_avg_price_500nan cafe_sum_1000_min_price_avgnan cafe_sum_1000_max_price_avgnan cafe_avg_price_1000nan cafe_sum_1500_min_price_avgnan cafe_sum_1500_max_price_avgnan cafe_avg_price_1500nan green_part_2000nan cafe_sum_2000_min_price_avgnan cafe_sum_2000_max_price_avgnan cafe_avg_price_2000nan cafe_sum_3000_min_price_avgnan cafe_sum_3000_max_price_avgnan cafe_avg_price_3000nan prom_part_5000nan cafe_sum_5000_min_price_avgnan cafe_sum_5000_max_price_avgnan cafe_avg_price_5000nan
0 30474 39.0 20.7 2 9 1 1998.0 1 8.9 3.0 7661 7661 2.615514e+07 178264 0.137846 0.041116 14080 11926.0 11 14892 24750.0 13 1 990.0 1 0 13 4 7661 0 4 4 7661 7661 7661 7661 7661 7661 7661 7661 102618 47681 54937 30808 16251 14557 121369 59138 62231 26087 7410 18677 14080 7457 6623 14892 7839 7053 34341 18094 16247 19906 9676 10230 27123 14340 12783 1681.0 173.0 607.0 19.0 245.0 116.0 431.0 1.0 84.0 5.0 1680.0 34.0 299.0 439.0 109.0 799.0 45 1.258957 0.735908 8.830901 0.735908 0.078502 0.746962 2.048487 0.061485 1.205404 0.967696 0.781053 10.56540 4.812102 57.745220 39.0 4.850748 6.274963 39 0.114134 1.369603 0.248151 7661 6.374826 19.651101 22.790985 24.079707 24.779082 4.152246 2 7661 5.706484 38 0.490549 7661 27.553486 32 8.424959 9 22.624362 16.224083 6.620081 4.121874 8.957780 8.824060 15.483912 5.353674 0.225788 3.673942 11.810839 20.392427 9.131977 3.300120 25.462741 1.613152 17.214870 7.922610 2.414138 4.923614 0.514211 0.746962 0.749142 0.848297 1.917736 19.953413 14.052207 12.228576 ... 0 0 1 1 0 0 3 0 20.14 0.70 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 2 0 0 0 1 2 0 0 4 0 15.17 1.18 0 0 0 0 3 1000.00 1500.00 1250.00 0 0 0 3 0 0 0 1 2 1 0 5 0 14.69 2.87 0 0 3 73000 12 781.82 1227.27 1004.55 1 2 2 7 0 0 0 1 3 1 0 7 0 21.58 4.69 1 37550 8 299166 19 676.47 1088.24 882.35 2 5 4 8 0 0 0 1 10 1 0 14 1 2015 7 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 30475 79.2 30.4 8 17 1 0.0 3 1.0 1.0 7661 7661 2.553630e+07 4001 0.496315 0.007122 275 3062.0 0 264 6974.0 0 0 990.0 0 0 0 0 7661 0 1 0 7661 7661 7661 7661 7661 7661 7661 7661 17790 8350 9443 574 297 277 2566 1356 1211 861 244 617 275 143 133 264 136 128 646 336 311 3796 2035 1762 506 261 245 295.0 44.0 1.0 0.0 84.0 7.0 92.0 0.0 0.0 0.0 295.0 0.0 2.0 144.0 73.0 31.0 21 4.230425 3.444625 41.335498 3.444625 1.192193 1.332570 4.400427 0.000000 0.742377 16.049420 2.244906 18.50054 5.458057 65.496687 24.0 5.458057 6.859956 24 0.826083 9.912993 0.799853 7661 6.847813 16.975793 19.692960 20.864427 21.722620 2.148398 13 7661 4.410488 27 2.342346 7661 27.421853 50 16.913175 8 29.425443 17.080113 8.545593 4.932827 10.039833 3.654955 15.092542 8.156185 1.313180 4.244082 4.438242 13.445121 8.332180 7.095895 26.807204 3.775300 12.440198 9.672779 1.764298 3.764819 1.694967 1.332570 1.672126 1.162371 12.239901 13.006107 9.661063 4.323941 ... 0 0 0 0 0 0 0 0 59.28 5.31 0 0 0 0 3 833.33 1500.00 1166.67 0 0 2 0 1 0 0 0 1 0 0 1 0 49.26 4.06 0 0 1 5000 7 757.14 1285.71 1021.43 0 1 3 2 1 0 0 1 2 0 0 1 0 39.50 3.32 0 0 2 22000 10 680.00 1200.00 940.00 0 1 6 2 1 0 0 1 5 0 0 7 0 39.10 7.70 2 177300 6 231300 20 733.33 1250.00 991.67 2 4 8 4 1 1 0 2 11 0 1 12 1 2015 7 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 30476 40.5 25.1 3 5 2 1960.0 2 4.8 2.0 7661 7661 9.946335e+06 139322 0.065409 0.225825 6400 2232.0 7 6558 7966.0 7 0 1548.0 3 0 13 0 7661 0 2 7 7661 7661 7661 7661 7661 7661 7661 7661 36154 16222 19932 13799 6937 6862 91795 44734 47061 33728 9653 24075 6400 3209 3191 6558 3317 3241 15514 7813 7701 8137 3787 4350 12162 6117 6045 561.0 111.0 0.0 0.0 254.0 3.0 189.0 0.0 4.0 0.0 561.0 0.0 5.0 437.0 79.0 40.0 44 1.585306 1.122214 13.466563 1.122214 0.065324 0.194608 2.513006 0.580638 0.900408 11.749900 3.389848 10.19563 3.628293 43.539514 68.0 3.977659 5.375048 59 0.116686 1.400229 1.384824 7661 3.499380 5.627481 8.090528 8.671086 10.320728 0.580638 10 7661 3.499380 1 2.220941 7661 10.093318 5 7.921607 3 1.823381 14.431252 0.826743 2.388288 3.760642 3.290966 16.304596 0.644830 0.966254 1.332737 3.131143 1.464174 1.499581 0.487817 6.718082 0.711768 4.862872 3.506298 1.456661 1.223804 2.330995 0.194608 1.400094 1.177527 9.938735 2.983875 1.988346 0.794245 ... 0 0 0 0 0 1 6 2 30.97 8.75 2 34100 1 0 19 655.56 1111.11 883.33 1 6 6 4 2 0 0 1 1 0 1 12 3 40.90 10.51 6 80237 3 14090 28 633.33 1092.59 862.96 1 7 12 6 2 0 0 2 2 0 4 14 4 45.86 9.08 8 215237 6 39106 37 608.33 1069.44 838.89 1 8 19 7 2 0 0 2 3 0 5 22 4 25.62 13.59 27 427889 26 1024431 179 668.97 1132.18 900.57 5 53 64 42 11 4 0 10 21 0 10 71 11 2015 7 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 30477 62.8 36.0 17 17 1 2016.0 2 62.8 3.0 7661 7661 2.149409e+07 7122 0.262459 0.017647 489 3062.0 0 469 6974.0 0 0 990.0 0 0 0 2 7661 0 0 0 7661 7661 7661 7661 7661 7661 7661 7661 9553 4529 5024 1021 529 493 4568 2414 2155 1533 435 1099 489 254 236 469 242 228 1150 597 553 2155 1206 950 900 465 435 295.0 44.0 1.0 0.0 84.0 7.0 92.0 0.0 0.0 0.0 295.0 0.0 2.0 144.0 73.0 31.0 45 7.931398 6.038848 68.559794 5.713316 3.189083 3.540105 5.612835 0.025446 0.466738 5.061917 2.701804 14.62944 10.284167 123.410001 39.0 10.609698 13.517419 39 3.093209 37.118504 0.233017 7661 8.928836 22.094252 25.062928 26.226045 26.960463 2.722667 38 7661 8.601110 2 4.476081 7661 37.436772 50 13.979650 8 26.895118 19.942295 9.434351 6.218331 13.345715 10.480798 10.723870 11.112624 4.480234 8.577223 15.200509 18.560234 12.253021 6.831966 30.366022 5.731266 15.382678 11.306566 6.589381 8.102094 0.403429 3.540105 5.411312 0.213853 6.153091 18.121220 16.938290 14.171229 ... 0 0 0 2 0 0 0 0 28.82 3.59 0 0 0 0 1 300.00 500.00 400.00 0 1 0 0 0 0 0 0 3 0 0 0 0 31.35 2.99 0 0 0 0 1 300.00 500.00 400.00 0 1 0 0 0 0 0 0 3 0 0 0 0 34.87 1.34 0 0 0 0 1 300.00 500.00 400.00 0 1 0 0 0 0 0 0 4 0 0 0 0 24.25 1.66 0 0 0 0 5 1560.00 2500.00 2030.00 0 1 0 1 1 2 0 0 10 0 0 2 0 2015 7 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 30478 40.0 40.0 17 17 1 0.0 1 1.0 1.0 7661 7661 2.553630e+07 4001 0.496315 0.007122 275 3062.0 0 264 6974.0 0 0 990.0 0 0 0 0 7661 0 1 0 7661 7661 7661 7661 7661 7661 7661 7661 17790 8350 9443 574 297 277 2566 1356 1211 861 244 617 275 143 133 264 136 128 646 336 311 3796 2035 1762 506 261 245 295.0 44.0 1.0 0.0 84.0 7.0 92.0 0.0 0.0 0.0 295.0 0.0 2.0 144.0 73.0 31.0 21 2.152792 1.722233 20.666800 1.722233 0.897889 1.234235 4.566595 0.427248 0.353642 16.784630 2.250137 19.14919 3.735666 44.827989 24.0 3.735666 4.782323 24 0.630014 7.560163 0.394422 7661 7.123215 17.148737 19.868997 21.038561 21.905792 2.808077 13 7661 3.688405 27 1.727223 7661 25.699461 50 17.366661 8 29.968660 17.397666 9.036942 5.506770 10.102328 3.729416 15.546028 6.433794 1.519553 2.521691 2.715850 13.898607 8.355285 7.401423 25.084813 2.052908 12.893684 9.479093 1.806570 4.338453 1.339078 1.234235 1.192543 1.186621 12.652956 13.459593 9.890758 4.555385 ... 0 0 0 0 0 0 0 0 43.85 1.55 0 0 0 0 3 833.33 1500.00 1166.67 0 0 2 0 1 0 0 1 1 0 0 0 0 38.61 3.12 0 0 2 22000 7 757.14 1285.71 1021.43 0 1 3 2 1 0 0 1 2 0 0 3 0 41.64 2.11 0 0 2 22000 9 700.00 1222.22 961.11 0 1 5 2 1 0 0 1 4 0 0 6 0 35.62 6.96 1 117300 4 201300 20 747.37 1263.16 1005.26 1 4 8 5 1 1 0 2 12 0 1 11 1 2015 7 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 340 columns

Performing log normal distribution on numerical continous variables

In [66]:
continuous_feature
Out[66]:
['full_sq',
 'life_sq',
 'floor',
 'max_floor',
 'build_year',
 'kitch_sq',
 'area_m',
 'raion_popul',
 'green_zone_part',
 'indust_part',
 'children_preschool',
 'preschool_quota',
 'children_school',
 'school_quota',
 'hospital_beds_raion',
 'office_raion',
 'full_all',
 'male_f',
 'female_f',
 'young_all',
 'young_male',
 'young_female',
 'work_all',
 'work_male',
 'work_female',
 'ekder_all',
 'ekder_male',
 'ekder_female',
 '0_6_all',
 '0_6_male',
 '0_6_female',
 '7_14_all',
 '7_14_male',
 '7_14_female',
 '0_17_all',
 '0_17_male',
 '0_17_female',
 '16_29_all',
 '16_29_male',
 '16_29_female',
 '0_13_all',
 '0_13_male',
 '0_13_female',
 'raion_build_count_with_material_info',
 'build_count_block',
 'build_count_wood',
 'build_count_brick',
 'build_count_monolith',
 'build_count_panel',
 'build_count_before_1920',
 'build_count_1921-1945',
 'build_count_1946-1970',
 'build_count_1971-1995',
 'build_count_after_1995',
 'ID_metro',
 'metro_min_avto',
 'metro_km_avto',
 'metro_min_walk',
 'metro_km_walk',
 'kindergarten_km',
 'school_km',
 'park_km',
 'green_zone_km',
 'industrial_km',
 'water_treatment_km',
 'cemetery_km',
 'incineration_km',
 'railroad_station_walk_km',
 'railroad_station_walk_min',
 'ID_railroad_station_walk',
 'railroad_station_avto_km',
 'railroad_station_avto_min',
 'ID_railroad_station_avto',
 'public_transport_station_km',
 'public_transport_station_min_walk',
 'water_km',
 'mkad_km',
 'ttk_km',
 'sadovoe_km',
 'bulvar_ring_km',
 'kremlin_km',
 'big_road1_km',
 'ID_big_road1',
 'big_road2_km',
 'ID_big_road2',
 'railroad_km',
 'zd_vokzaly_avto_km',
 'bus_terminal_avto_km',
 'oil_chemistry_km',
 'nuclear_reactor_km',
 'radiation_km',
 'power_transmission_line_km',
 'thermal_power_plant_km',
 'ts_km',
 'big_market_km',
 'market_shop_km',
 'fitness_km',
 'swim_pool_km',
 'ice_rink_km',
 'stadium_km',
 'basketball_km',
 'hospice_morgue_km',
 'detention_facility_km',
 'public_healthcare_km',
 'university_km',
 'workplaces_km',
 'shopping_centers_km',
 'office_km',
 'additional_education_km',
 'preschool_km',
 'big_church_km',
 'church_synagogue_km',
 'mosque_km',
 'theater_km',
 'museum_km',
 'exhibition_km',
 'catering_km',
 'green_part_500',
 'prom_part_500',
 'office_count_500',
 'office_sqm_500',
 'trc_sqm_500',
 'cafe_count_500',
 'cafe_sum_500_min_price_avg',
 'cafe_sum_500_max_price_avg',
 'cafe_avg_price_500',
 'cafe_count_500_price_500',
 'cafe_count_500_price_1000',
 'cafe_count_500_price_1500',
 'green_part_1000',
 'prom_part_1000',
 'office_count_1000',
 'office_sqm_1000',
 'trc_sqm_1000',
 'cafe_count_1000',
 'cafe_sum_1000_min_price_avg',
 'cafe_sum_1000_max_price_avg',
 'cafe_avg_price_1000',
 'cafe_count_1000_na_price',
 'cafe_count_1000_price_500',
 'cafe_count_1000_price_1000',
 'cafe_count_1000_price_1500',
 'cafe_count_1000_price_2500',
 'cafe_count_1000_price_4000',
 'church_count_1000',
 'leisure_count_1000',
 'green_part_1500',
 'prom_part_1500',
 'office_count_1500',
 'office_sqm_1500',
 'trc_count_1500',
 'trc_sqm_1500',
 'cafe_count_1500',
 'cafe_sum_1500_min_price_avg',
 'cafe_sum_1500_max_price_avg',
 'cafe_avg_price_1500',
 'cafe_count_1500_na_price',
 'cafe_count_1500_price_500',
 'cafe_count_1500_price_1000',
 'cafe_count_1500_price_1500',
 'cafe_count_1500_price_2500',
 'cafe_count_1500_price_4000',
 'big_church_count_1500',
 'church_count_1500',
 'leisure_count_1500',
 'sport_count_1500',
 'green_part_2000',
 'prom_part_2000',
 'office_count_2000',
 'office_sqm_2000',
 'trc_count_2000',
 'trc_sqm_2000',
 'cafe_count_2000',
 'cafe_sum_2000_min_price_avg',
 'cafe_sum_2000_max_price_avg',
 'cafe_avg_price_2000',
 'cafe_count_2000_na_price',
 'cafe_count_2000_price_500',
 'cafe_count_2000_price_1000',
 'cafe_count_2000_price_1500',
 'cafe_count_2000_price_2500',
 'cafe_count_2000_price_4000',
 'big_church_count_2000',
 'church_count_2000',
 'leisure_count_2000',
 'sport_count_2000',
 'green_part_3000',
 'prom_part_3000',
 'office_count_3000',
 'office_sqm_3000',
 'trc_count_3000',
 'trc_sqm_3000',
 'cafe_count_3000',
 'cafe_sum_3000_min_price_avg',
 'cafe_sum_3000_max_price_avg',
 'cafe_avg_price_3000',
 'cafe_count_3000_na_price',
 'cafe_count_3000_price_500',
 'cafe_count_3000_price_1000',
 'cafe_count_3000_price_1500',
 'cafe_count_3000_price_2500',
 'cafe_count_3000_price_4000',
 'big_church_count_3000',
 'church_count_3000',
 'leisure_count_3000',
 'sport_count_3000',
 'green_part_5000',
 'prom_part_5000',
 'office_count_5000',
 'office_sqm_5000',
 'trc_count_5000',
 'trc_sqm_5000',
 'cafe_count_5000',
 'cafe_sum_5000_min_price_avg',
 'cafe_sum_5000_max_price_avg',
 'cafe_avg_price_5000',
 'cafe_count_5000_na_price',
 'cafe_count_5000_price_500',
 'cafe_count_5000_price_1000',
 'cafe_count_5000_price_1500',
 'cafe_count_5000_price_2500',
 'cafe_count_5000_price_4000',
 'cafe_count_5000_price_high',
 'big_church_count_5000',
 'church_count_5000',
 'leisure_count_5000',
 'sport_count_5000',
 'price_doc']
In [67]:
import numpy as np
num_features=continuous_feature

for feature in num_features:
    train_data[feature]=np.log(train_data[feature])
In [68]:
for feature in continuous_feature_test:
    data=test_data.copy()
    if 0 in data[feature].unique():
        pass
    else:
        data[feature]=np.log(data[feature])
        #data['SalePrice']=np.log(data['SalePrice'])
        #plt.scatter(data[feature],data['SalePrice'])
        plt.xlabel(feature)
        plt.ylabel('SalesPrice')
        plt.title(feature)
        plt.show()
In [69]:
import numpy as np
num_features=continuous_feature_test

for feature in num_features:
    train_data[feature]=np.log(train_data[feature])
In [70]:
train_data.head()
Out[70]:
id full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km ... leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc year month day life_sqnan floornan max_floornan materialnan build_yearnan num_roomnan kitch_sqnan statenan preschool_quotanan school_quotanan hospital_beds_raionnan raion_build_count_with_material_infonan build_count_blocknan build_count_woodnan build_count_framenan build_count_bricknan build_count_monolithnan build_count_panelnan build_count_foamnan build_count_slagnan build_count_mixnan raion_build_count_with_builddate_infonan build_count_before_1920nan build_count_1921-1945nan build_count_1946-1970nan build_count_1971-1995nan build_count_after_1995nan metro_min_walknan metro_km_walknan railroad_station_walk_kmnan railroad_station_walk_minnan ID_railroad_station_walknan cafe_sum_500_min_price_avgnan cafe_sum_500_max_price_avgnan cafe_avg_price_500nan cafe_sum_1000_min_price_avgnan cafe_sum_1000_max_price_avgnan cafe_avg_price_1000nan cafe_sum_1500_min_price_avgnan cafe_sum_1500_max_price_avgnan cafe_avg_price_1500nan cafe_sum_2000_min_price_avgnan cafe_sum_2000_max_price_avgnan cafe_avg_price_2000nan cafe_sum_3000_min_price_avgnan cafe_sum_3000_max_price_avgnan cafe_avg_price_3000nan prom_part_5000nan cafe_sum_5000_min_price_avgnan cafe_sum_5000_max_price_avgnan cafe_avg_price_5000nan
0 0.000000 1.324738 1.192660 0.326634 0.910235 1.0 2.026877 2.0 0.583198 2.0 1 16 2.751939 2.481138 NaN NaN 2.215612 2.142110 5 2.223625 2.231255 5 0 1.701222 1 0 7 3 0 0 16 -inf 0 1 1 0 0 1 0 0 2.430494 2.361655 2.373089 2.298535 2.230690 2.221910 2.441898 2.385483 2.373498 2.351101 2.226430 2.317627 2.215612 2.139688 2.134215 2.223625 2.152431 2.138407 2.309474 2.242433 2.233670 2.279359 2.213876 2.196949 2.285827 2.217115 2.208148 1.677444 1.169032 NaN 0.0 NaN -0.366513 1.651527 0.0 0.0 0.0 5.351858 NaN NaN NaN 1.672953 0.475885 -inf -0.049452 -2.092876 0.958675 -2.092876 NaN NaN -0.262074 NaN -2.553283 1.152083 -0.527503 0.254759 0.524774 1.429110 -inf 0.524774 0.658750 -inf NaN 0.177199 NaN 1 -1.043160 0.871489 0.944940 0.961500 1.000047 -1.043160 -inf 0 0.294990 0.475885 -1.323037 1 0.976629 101 1.160072 1 1.064297 0.556015 -1.657353 -2.802852 0.565480 0.378781 0.867461 -0.660556 NaN 0.113385 -2.280919 0.740911 0.229102 -0.136648 0.369117 NaN 0.644140 NaN NaN NaN NaN NaN NaN NaN 0.314196 0.971854 0.693177 0.667466 ... -inf 6 1 0.977634 0.659805 0.094048 2.359478 2.197225 2.489220 1.260266 1.846844 1.928181 1.893718 -inf 0.970422 0.874591 0.583198 -0.366513 NaN 0 -inf -0.366513 0 NaN 0.665730 1 0.902416 1.019104 0.787195 2.497225 1.079918 2.641523 1.276345 1.859491 1.938702 1.905044 -inf 0.996229 0.874591 0.583198 -0.366513 -inf 0 -inf -0.366513 0 NaN 0.834032 1 0.909564 0.957965 0.910235 2.520548 1.142787 2.650817 1.439718 1.865779 1.943641 1.910486 0.475885 1.113344 1.128508 1.019781 0.094048 -inf 0 -0.366513 0.326634 0 NaN 1.113344 1 0.944625 0.951085 1.214110 2.610184 1.374030 2.722013 1.614203 1.881486 1.957005 1.924752 0.910235 1.298436 1.353565 1.305323 0.787195 0.326634 NaN 0.941939 1.128508 1 NaN 1.374030 4 15.581952 2011 8 2.995732 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0.693147 1.260266 1.079918 0.094048 0.910235 1.0 2.026877 2.0 0.583198 2.0 1 17 2.777338 2.455799 NaN NaN 2.178877 2.085084 5 2.192392 2.167710 8 0 1.692624 1 0 6 1 1 1 3 NaN 0 1 1 0 0 1 0 0 2.419676 2.345643 2.365318 2.268318 2.194752 2.193009 2.412248 2.349537 2.346678 2.331153 2.212015 2.293716 2.178877 2.098111 2.096254 2.192392 2.112760 2.110919 2.280474 2.208797 2.205116 2.264538 2.189649 2.189974 2.254158 2.179679 2.177552 1.704977 1.485877 -inf 0.0 1.436201 0.326634 1.504035 0.0 0.0 0.0 5.497168 -inf -inf 1.601979 1.488584 0.996229 -0.366513 NaN NaN 0.708459 NaN NaN NaN NaN NaN NaN -1.288377 0.430446 0.768902 0.204814 1.311626 -0.366513 0.256556 0.433886 -0.366513 NaN NaN NaN 1 0.811663 0.124596 0.622296 0.739976 0.771542 0.058597 -0.366513 0 0.124596 0.326634 NaN 1 0.799227 32 0.554769 2 0.788942 0.223054 0.002207 -1.513865 0.206621 NaN 0.659100 0.207842 NaN -0.366402 0.785818 0.594827 -1.898586 -0.068666 0.931238 -0.940322 -0.483586 NaN NaN NaN -2.661836 NaN NaN NaN 0.455672 0.653006 NaN -0.152953 ... 1.386294 2 0 1.121498 0.714183 0.094048 2.445959 1.945910 2.464062 1.041412 1.878341 1.959385 1.925192 NaN 0.583198 0.665730 -inf -0.366513 -inf 0 -inf 0.475885 0 0.326634 0.787195 0 1.133890 1.084348 0.326634 2.486305 0.732099 2.492834 1.113344 1.878588 1.957572 1.924091 NaN 0.665730 0.732099 0.094048 -0.366513 -inf 0 -inf 0.475885 0 0.326634 0.874591 0 1.062727 1.196229 0.910235 2.611501 0.970422 2.573021 1.224128 1.863670 1.944545 1.910331 -inf 0.874591 0.874591 0.326634 -0.366513 -inf 0 -inf 0.665730 0 0.583198 1.079918 1 0.845118 1.197883 1.432618 2.694980 1.305323 2.675939 1.644061 1.873792 1.952528 1.919109 0.787195 1.358877 1.428968 1.276345 0.996229 0.094048 NaN 0.996229 1.214110 1 0.834032 1.432618 14 15.607270 2011 8 3.135494 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 1.098612 1.324738 1.214110 -0.366513 0.910235 1.0 2.026877 2.0 0.583198 2.0 1 16 2.733448 2.444940 NaN NaN 2.160923 1.986263 4 2.167159 2.154890 7 0 1.956682 1 0 5 1 0 0 0 -inf 0 1 1 1 0 1 0 0 2.445174 2.373793 2.391642 2.248642 2.178134 2.166900 2.403066 2.338688 2.337964 2.316315 2.190210 2.280376 2.160923 2.084123 2.070860 2.167159 2.090906 2.077625 2.262600 2.193280 2.181843 2.289812 2.209369 2.224118 2.233053 2.161645 2.149694 1.757701 1.405493 NaN 0.0 1.672953 0.326634 1.409607 0.0 1.0 0.0 5.799093 -inf NaN 1.705717 1.421453 1.097189 0.094048 -0.285169 -0.706286 1.048609 -0.997573 NaN NaN NaN NaN NaN 0.464875 0.197371 0.910117 -1.406380 1.004278 0.094048 -1.406380 -0.632122 0.094048 NaN 0.316610 NaN 1 0.544429 0.071524 0.663032 0.735344 0.790609 NaN 0.094048 0 0.071524 0.326634 NaN 1 0.813394 5 0.643770 3 0.561872 0.701008 NaN -0.752118 0.265114 0.239312 0.559394 -1.143268 NaN -1.539213 -0.382216 NaN -0.401599 NaN 0.712420 NaN NaN -0.886323 -0.925214 -0.835263 NaN NaN 0.145388 NaN 0.727386 0.373192 0.139274 0.470657 ... -inf 5 3 0.830538 0.645309 NaN NaN 0.000000 2.062202 0.970422 1.832166 1.919969 1.883216 -0.366513 0.326634 0.583198 -0.366513 NaN NaN 0 NaN 0.326634 0 NaN 0.583198 5 0.941639 0.934339 0.326634 2.443644 0.665730 2.385963 1.156269 1.845998 1.929311 1.894162 -0.366513 0.732099 0.787195 0.326634 -inf NaN 0 NaN 0.326634 0 NaN 0.732099 5 0.914892 1.186511 0.732099 2.452382 0.665730 2.385963 1.311994 1.879071 1.957789 1.924402 -0.366513 0.787195 1.041412 0.787195 0.094048 -inf 0 NaN 0.874591 0 NaN 1.097189 6 0.961901 1.122253 1.324738 2.653686 1.268453 2.658054 1.569453 1.880213 1.958276 1.925122 0.834032 1.214110 1.336753 1.169032 0.834032 0.094048 NaN 0.874591 1.192660 0 0.326634 1.436201 10 15.555977 2011 8 3.295837 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 1.386294 1.501549 1.364055 0.787195 0.910235 1.0 2.026877 2.0 0.583198 2.0 1 18 2.794099 2.492560 NaN NaN 2.249118 2.178200 9 2.253706 2.276720 10 0 1.931189 1 0 17 6 0 0 11 0.326634 0 1 1 0 0 1 0 0 2.298540 2.218441 2.233765 2.328240 2.261163 2.255328 2.459454 2.398147 2.398601 2.331476 2.209829 2.295111 2.249118 2.174936 2.171405 2.253706 2.182844 2.173194 2.339443 2.273352 2.267146 2.091772 1.985037 2.017381 2.314908 2.246628 2.241268 1.812684 0.787195 1.369104 12.0 1.572832 1.364055 1.668330 0.0 9.0 2.0 6.129050 0.941939 1.156269 1.305323 1.582588 1.710085 0.326634 -0.920957 NaN 0.895287 NaN NaN NaN NaN NaN NaN 1.011161 -0.354319 0.978887 0.376119 1.371568 0.326634 0.292083 0.508170 0.326634 NaN -0.783301 -1.700447 1 -0.015109 0.986364 1.050735 1.067265 1.088477 -0.015109 -inf 0 0.022361 1.041412 -0.367043 1 1.051159 83 0.645668 1 1.200047 0.812556 0.614241 -0.562769 0.881276 NaN 1.202481 NaN NaN -0.403479 0.625516 0.692908 0.466303 0.236512 0.776386 -0.258983 0.870897 NaN NaN NaN NaN NaN -3.464701 -0.808221 1.003526 1.041218 1.020716 0.479558 ... -inf 3 1 1.207671 0.632608 -0.366513 2.230622 1.945910 2.433780 1.142787 1.873815 1.950237 1.917629 NaN 0.475885 0.787195 0.732099 -inf NaN 0 -inf NaN 0 NaN 0.787195 2 1.245525 0.557165 -0.366513 2.230622 0.665730 2.433780 1.169032 1.870608 1.948918 1.915631 NaN 0.475885 0.874591 0.732099 -inf NaN 0 -inf -inf 0 NaN 0.941939 2 1.110037 0.241035 0.326634 2.487050 0.910235 2.504255 1.242925 1.883657 1.960881 1.928038 NaN 0.475885 0.970422 0.834032 0.094048 NaN 0 -inf -0.366513 0 NaN 1.061385 3 0.975251 0.306314 0.732099 2.518148 1.128508 2.621472 1.413636 1.922332 1.994388 1.963549 0.326634 0.665730 1.113344 0.996229 0.874591 -0.366513 -inf 0.326634 0.326634 0 NaN 1.181143 3 16.388123 2011 9 0.000000 0 0 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 1.609438 1.468751 1.468751 0.326634 0.910235 1.0 2.026877 2.0 0.583198 2.0 1 16 2.769055 2.450269 NaN NaN 2.157475 2.089804 7 2.176682 2.192550 9 0 1.845537 4 2 25 2 0 0 10 1.511296 0 1 1 1 1 1 0 0 2.326920 2.252562 2.261000 2.251357 2.183365 2.167195 2.409455 2.345744 2.344535 2.321864 2.203340 2.283302 2.157475 2.079485 2.068110 2.176682 2.104903 2.083680 2.265037 2.197360 2.182813 2.145869 2.061368 2.061269 2.236598 2.167453 2.151254 1.889298 1.353565 NaN 0.0 1.866580 1.019781 1.268453 0.0 3.0 1.0 6.614726 1.777695 1.555235 1.606154 1.417583 1.378840 0.475885 -1.474574 NaN 0.747724 NaN NaN NaN NaN NaN NaN 0.862343 0.074408 0.906994 NaN 0.844595 0.475885 -0.760491 -0.263463 1.553373 NaN NaN NaN 1 0.897083 -0.609928 NaN NaN -0.054167 -0.609928 0.326634 0 0.132922 0.834032 NaN 0 -0.760491 113 -1.041095 4 0.628203 0.770102 -0.705888 0.254611 0.420962 -0.041731 0.794939 -0.388755 NaN -0.068267 0.322175 0.249915 NaN -0.473102 0.284928 -2.164146 NaN NaN NaN NaN NaN NaN NaN NaN -0.051839 -2.238699 -0.531273 -1.229598 ... 1.791759 7 0 0.347732 0.454158 1.511296 2.638648 2.197225 2.565553 1.723802 1.893447 1.966961 1.935473 1.079918 1.446565 1.459560 1.453174 1.224128 0.583198 1 1.061385 1.224128 0 0.834032 0.970422 2 0.412588 0.478362 1.610227 2.660336 1.041412 2.583568 1.821321 1.893276 1.966576 1.935164 1.203634 1.582588 1.581000 1.584161 1.364055 0.970422 1 1.268453 1.413636 0 1.041412 1.113344 3 0.483269 0.767366 1.744023 2.711073 1.409607 2.684239 1.942123 1.909364 1.981231 1.950402 1.421453 1.719815 1.720487 1.717098 1.610227 1.397 4 1.446565 1.567739 1 1.305323 1.468751 5 0.754171 0.871543 1.877209 2.769101 1.555235 2.712649 2.045529 1.909511 1.981329 1.950517 1.601979 1.846657 1.849961 1.842698 1.751838 1.543753 1.041412 1.590311 1.698150 2 1.506488 1.662599 14 16.608603 2011 9 1.609438 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 345 columns

FEATURE SCALING

In [71]:
scaling_feature=[feature for feature in train_data.columns if feature not in ['id','Price_doc'] ]
len(scaling_feature)
Out[71]:
344
In [72]:
train_data.fillna(0,inplace=True)
In [73]:
[min(train_data[i]) for i in train_data.columns]
Out[73]:
[0.0,
 -inf,
 -inf,
 -inf,
 -inf,
 1.0,
 -inf,
 0.0,
 -inf,
 1.0,
 0,
 0,
 2.6774989428536276,
 2.0595294527304104,
 0.0,
 0.0,
 1.641863663944111,
 0.0,
 0,
 1.633928354228994,
 1.9343700821476857,
 0,
 0,
 0.0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -inf,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2.0595294527304104,
 1.9596328997183297,
 1.974243635478386,
 1.7749349530624492,
 1.6566548425881231,
 1.6440614850208792,
 2.0012332255705867,
 1.9110842330081819,
 1.894269181238364,
 1.8415452149350158,
 1.61935972946482,
 1.7873848498299765,
 1.641863663944111,
 1.50648771329857,
 1.4912513280732116,
 1.633928354228994,
 1.4964725796851872,
 1.4831304777411851,
 1.7948535467124367,
 1.6800782821876006,
 1.6654905987582682,
 1.8491427716078928,
 1.7457329447065377,
 1.710800555208106,
 1.7534605989414096,
 1.631588328253238,
 1.61935972946482,
 -inf,
 -inf,
 -inf,
 0.0,
 -inf,
 -inf,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 -7.960598786346565,
 -9.839206968048753,
 -5.756901923145519,
 -9.839206968048753,
 -6.409107837255395,
 -7.363242555871257,
 -9.895921628063967,
 -5.71074990895329,
 -10.516654798246318,
 -5.596157089002031,
 -9.152124461512093,
 -5.301011695861906,
 -7.979591133121632,
 -2.364158535275807,
 -inf,
 -6.568681126087842,
 -8.677846244204565,
 -inf,
 -5.552340986880593,
 -7.999832843634663,
 -9.853923978622282,
 0,
 -8.075394215172222,
 -6.386686949460646,
 -5.438501585932792,
 -4.987595439069698,
 -3.953089322356667,
 -8.075384575035306,
 -inf,
 0,
 -8.199709904254176,
 -inf,
 -10.516654798246318,
 0,
 -4.567625782444035,
 5,
 -7.835815371846494,
 1,
 -4.691567903246553,
 -6.791690943582811,
 -8.512639496896666,
 -7.87623202024998,
 -5.4671174105889655,
 -7.891259963750091,
 -6.767786208227923,
 -8.924018212329587,
 -8.237162889211437,
 -8.617352647072467,
 -13.60120657469366,
 -6.306148670262413,
 -7.819587781838355,
 -10.42407538206976,
 -6.162857867528065,
 -10.311111584483152,
 -6.34014968366858,
 -8.100425642082211,
 -8.375370596575328,
 -9.921350268163124,
 -10.487614911910729,
 -7.363242555871257,
 -8.55392245119615,
 -8.207960479608374,
 -7.777596203159332,
 -7.082120440593117,
 -7.298102385579864,
 -7.115961934607281,
 -7.596311363141728,
 0,
 -inf,
 -inf,
 -inf,
 0.0,
 0,
 0.0,
 -inf,
 1.7411295468056076,
 1.8269026656007323,
 1.790335880924894,
 0,
 -inf,
 -inf,
 -inf,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -inf,
 -inf,
 -inf,
 0.0,
 0,
 0.0,
 -inf,
 1.7411295468056076,
 1.8269026656007323,
 1.790335880924894,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 0,
 0,
 -inf,
 0,
 -inf,
 0,
 0,
 -4.610149476789775,
 -inf,
 -inf,
 0.0,
 -inf,
 0.0,
 -inf,
 1.7411295468056076,
 1.8269026656007323,
 1.790335880924894,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 0,
 -inf,
 -inf,
 0,
 -inf,
 -inf,
 0,
 -2.5644559444400956,
 -inf,
 -inf,
 0.0,
 -inf,
 0.0,
 -inf,
 1.7411295468056076,
 1.8269026656007323,
 1.790335880924894,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 0,
 -inf,
 -inf,
 0,
 -inf,
 -inf,
 0,
 -1.9678147153196068,
 -inf,
 -inf,
 0.0,
 -inf,
 0.0,
 -inf,
 1.7411295468056076,
 1.8269026656007323,
 1.790335880924894,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 0,
 -inf,
 -inf,
 0,
 -inf,
 -inf,
 0,
 0.22988953758400052,
 -inf,
 -inf,
 0.0,
 -inf,
 0.0,
 -inf,
 1.7411295468056076,
 1.8269026656007323,
 1.790335880924894,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 -inf,
 0,
 -inf,
 -inf,
 0,
 11.512925464970229,
 2011,
 1,
 0.0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]
In [74]:
for i in train_data.columns:
    train_data.loc[(train_data[i]==min(train_data[i])),i]=np.quantile(train_data[i],1)
In [75]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
train_data.loc[:,scaling_feature] = scaler.fit_transform(train_data.loc[:,scaling_feature])
In [76]:
scaling_feature_test=[feature for feature in test_data.columns if feature not in ['id'] ]
len(scaling_feature_test)
Out[76]:
339
In [77]:
for i in test_data.columns:
    test_data.loc[(test_data[i]==min(test_data[i])),i]=np.quantile(test_data[i],1)
In [78]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
test_data.loc[:,scaling_feature_test] = scaler.fit_transform(test_data.loc[:,scaling_feature_test])
In [79]:
test_data.head()
Out[79]:
id full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km ... cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 year month day life_sqnan build_yearnan statenan preschool_quotanan school_quotanan hospital_beds_raionnan raion_build_count_with_material_infonan build_count_blocknan build_count_woodnan build_count_framenan build_count_bricknan build_count_monolithnan build_count_panelnan build_count_foamnan build_count_slagnan build_count_mixnan raion_build_count_with_builddate_infonan build_count_before_1920nan build_count_1921-1945nan build_count_1946-1970nan build_count_1971-1995nan build_count_after_1995nan metro_min_walknan metro_km_walknan railroad_station_walk_kmnan railroad_station_walk_minnan ID_railroad_station_walknan cafe_sum_500_min_price_avgnan cafe_sum_500_max_price_avgnan cafe_avg_price_500nan cafe_sum_1000_min_price_avgnan cafe_sum_1000_max_price_avgnan cafe_avg_price_1000nan cafe_sum_1500_min_price_avgnan cafe_sum_1500_max_price_avgnan cafe_avg_price_1500nan green_part_2000nan cafe_sum_2000_min_price_avgnan cafe_sum_2000_max_price_avgnan cafe_avg_price_2000nan cafe_sum_3000_min_price_avgnan cafe_sum_3000_max_price_avgnan cafe_avg_price_3000nan prom_part_5000nan cafe_sum_5000_min_price_avgnan cafe_sum_5000_max_price_avgnan cafe_avg_price_5000nan
0 38135 -0.720188 -0.425838 -0.877825 -0.581216 0.531818 0.187170 1.295149 -0.222979 0.146649 0.0 0.0 0.362943 1.579349 -0.496943 -0.722049 2.249692 4.116125 1.104928 2.321928 4.294872 1.307487 -3.093263 -0.148247 -1.300880 0.326089 0.154780 -0.375633 0.0 0.265746 -0.539118 -0.682446 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.124467 -0.121389 -0.127027 2.289409 2.386136 2.181858 1.758536 1.635214 1.873301 0.456361 0.326359 0.512294 2.249692 2.358689 2.129094 2.321928 2.413747 2.216802 2.281233 2.374503 2.177963 -0.161461 -0.166827 -0.153458 2.289390 2.396009 2.170917 3.698658 2.062212 0.705857 -1.693016 0.391978 2.414051 2.940625 -5.415450 0.782686 -1.051917 3.698107 -1.181503 1.074286 1.343795 0.275823 3.635613 -0.466868 -0.553255 -0.479885 -0.435708 -0.435708 -0.367592 -0.192589 -0.233743 -0.758421 -0.065234 -1.636107 -0.864453 -0.146032 0.171800 0.171800 -0.018409 0.155498 0.153153 -0.191873 -0.215017 -0.215017 -0.937407 0.0 -0.032554 1.051621 1.082121 1.102163 0.678459 1.866258 -1.030354 0.0 1.273101 0.566921 -0.611995 0.0 1.068773 -1.078169 -0.256843 0.253219 0.425039 0.738345 0.345405 0.131329 0.287966 0.808903 0.242382 0.413551 -0.467709 -0.054884 1.073371 1.493166 0.745606 0.104272 0.962672 -0.378084 1.490326 0.654183 0.257094 0.930275 -0.363237 -0.197951 -0.473754 -0.207520 -1.046394 1.402244 1.142573 1.296132 ... 0.373905 0.224449 -1.516908 -0.951125 0.0 0.457361 -0.698101 0.614724 -0.007653 -0.923969 1.296533 1.314706 1.740068 1.765686 -0.469029 0.909483 0.538555 0.684330 1.060684 1.504421 2.105188 -0.710710 1.082391 0.537703 0.272655 -1.001678 -0.491847 0.0 0.633197 -0.762432 0.887953 -0.430031 -0.983962 1.533960 1.540772 2.092902 2.303388 -0.397717 0.753228 0.420868 0.550597 1.332940 1.994457 2.341679 -0.532825 1.337040 0.749555 0.316944 -0.807555 -0.390294 0.0 0.773571 -0.735301 1.110466 -0.637304 -1.043258 1.871263 1.824614 -0.875123 -0.841788 -0.379226 0.179678 -0.085962 0.017290 -0.738505 -0.546056 -0.502142 -0.415542 1.746133 1.096697 0.414079 -0.661712 -0.391445 -2.416831 1.030214 -0.831028 1.485452 -0.145386 -1.056496 -0.562970 -0.677949 -1.002645 -0.983062 -0.518350 -0.650138 -0.893287 -0.799249 -0.650806 -0.545564 -0.591706 -0.488643 3.044759 2.041240 0.548158 -0.566298 -0.475511 -1.587810 1.440184 -0.936058 -1.271634 0.0 -0.071315 1.644608 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 30475 1.242129 -0.083525 -0.213539 0.339382 0.531818 0.255150 -0.694237 -0.241597 1.313446 0.0 0.0 0.335827 -1.483901 1.515007 -0.945628 -1.338624 -0.130403 1.623708 -1.348669 -0.250098 1.552875 0.323283 -0.148247 1.124855 0.326089 1.832181 1.685920 0.0 0.265746 -0.931349 1.550957 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.437313 -0.438222 -0.436383 -1.349119 -1.335979 -1.360330 -1.460973 -1.468277 -1.445139 -1.498586 -1.460280 -1.507933 -1.338624 -1.322969 -1.352238 -1.348669 -1.338617 -1.354803 -1.356053 -1.344436 -1.365700 -0.438010 -0.438158 -0.438357 -1.349680 -1.336207 -1.360952 -0.319273 -0.340464 -0.924861 0.530194 -0.454359 -0.449821 -0.351987 0.189085 0.782686 0.467651 -0.318451 0.793199 -0.767822 -0.274819 -0.338500 -0.365554 -0.894191 -0.072462 -0.029076 -0.012167 -0.012167 0.030438 -0.057449 0.367951 4.013384 -0.325963 0.500734 0.019043 0.958208 0.338201 0.338201 -0.500349 0.315782 0.293118 -0.599768 0.215116 0.215116 0.398599 0.0 0.052287 0.732605 0.722609 0.734415 0.413785 0.266975 -0.339099 0.0 0.592906 -0.054341 0.285373 0.0 1.056135 -0.559623 0.833124 -0.015204 1.062965 0.848426 0.665156 0.293251 0.466983 -0.202339 0.195739 1.188617 0.018733 0.054386 -0.326146 0.579614 0.609975 1.172162 1.081739 0.100419 0.813163 0.954658 0.051790 0.548483 -0.044727 -0.059619 -0.191324 0.100051 0.814213 0.460533 0.431872 -0.144518 ... 0.373905 0.224449 0.700431 1.111609 0.0 0.457361 1.577157 0.614724 2.436839 -0.682165 1.296533 1.314706 1.740068 1.765686 -0.464624 0.295729 0.538555 0.444234 1.060684 1.504421 -0.561746 1.530591 -0.987468 0.537703 0.272655 1.054297 -0.533294 0.0 0.633197 -1.050444 0.887953 2.010009 -0.776539 1.533960 1.540772 -0.928094 -0.704428 -0.384394 -0.069406 -0.041665 -0.052826 1.332940 -0.588202 -0.517802 -0.542439 -0.807938 0.749555 0.316944 -0.807555 -0.390294 0.0 0.773571 -0.988001 1.110466 1.182109 -0.992953 1.871263 1.824614 -0.933155 -0.909670 -0.384456 -0.345776 -0.175224 -0.241776 1.587716 -0.552448 -0.471351 -0.454169 -0.649666 1.096697 0.414079 -0.661712 -0.328256 0.413765 1.030214 -0.831028 1.485452 1.453722 -0.560321 -0.558772 -0.638758 -1.071682 -1.040531 -0.516717 -0.247162 -0.161538 -0.195068 -0.650806 -0.551490 -0.567129 -0.515133 -0.504662 -0.592727 0.548158 -0.543422 -0.457269 0.629798 -0.822869 -0.971107 -1.271634 0.0 -0.071315 1.644608 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 30476 -0.646967 -0.270562 -0.767110 -1.041515 -2.399684 0.064159 -0.818574 -0.232642 -1.020148 0.0 0.0 -0.347291 0.894814 -0.903506 0.492785 0.253439 -0.528036 0.067370 0.230682 0.003536 -0.164841 0.323283 0.544834 -0.330586 0.326089 0.154780 1.685920 0.0 0.265746 -0.800605 -0.633540 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.369586 -0.374809 -0.365058 0.242451 0.213153 0.273095 0.957094 0.861570 1.048313 1.048518 0.885588 1.116125 0.253439 0.220367 0.288118 0.230682 0.210944 0.250743 0.248909 0.221422 0.277602 -0.363491 -0.375945 -0.351286 0.243925 0.216166 0.272868 0.451845 0.907438 1.206374 0.530194 0.439288 -0.554917 0.590147 0.189085 -1.335251 0.467651 0.452960 0.793199 -0.749215 1.332822 -0.236113 -0.318665 -0.484673 -0.500450 -0.415593 -0.375304 -0.375304 -0.372301 -0.320054 -0.114906 0.495733 -0.236976 -0.108440 0.710064 -0.197488 -0.133154 -0.133154 0.913342 -0.074933 -0.062160 0.351986 -0.213475 -0.213475 1.815171 0.0 -0.548335 -0.620620 -0.623805 -0.660199 -0.573558 -0.984264 -0.527623 0.0 0.114717 1.696487 0.226541 0.0 -0.607469 1.485754 -0.321478 -1.357320 -1.526046 0.507795 -0.616638 -0.214814 -0.571856 -0.273547 0.340189 -0.888731 -0.136464 -0.503585 -0.574268 -0.995849 -0.548705 -0.686932 -0.697363 -0.577568 -0.261485 -0.104027 -0.045401 -0.288713 0.126843 -0.328428 -0.274565 0.114893 0.399420 -0.897984 -0.809948 -0.787819 ... 0.373905 0.224449 0.700431 1.111609 0.0 -2.349895 -0.271490 -1.163312 0.668735 -0.501730 -0.851352 -0.880011 -0.885156 1.765686 -0.394142 -0.358901 -0.379051 -0.372144 -1.021822 -0.675329 -0.497867 -0.682517 -0.967941 0.537703 0.272655 -1.001678 -0.533294 0.0 -1.672948 0.005601 -0.586660 1.411631 -0.311998 -0.697959 -0.780531 -0.745004 -0.693237 -0.314451 -0.488785 -0.458505 -0.471186 -0.828571 -0.533252 -0.423533 -0.503980 -0.795467 0.749555 0.316944 -0.775026 -0.390294 0.0 -1.243136 -0.166727 -0.183291 1.648513 -0.349047 -0.602747 -0.672130 -0.701027 -0.886902 -0.313844 -0.715637 -0.602578 -0.647572 -0.738505 -0.507702 -0.371281 -0.415542 -0.640344 1.096697 0.414079 -0.636512 -0.391445 0.413765 -0.942915 -0.294473 -0.298954 0.223358 0.410600 -0.453828 -0.568485 -0.381312 -0.368899 -0.257125 -0.703292 -0.694517 -0.698552 -0.600535 -0.261104 -0.223047 -0.263478 -0.409503 -0.537852 0.548158 -0.360410 -0.274848 0.629798 -0.627028 0.062841 0.314596 0.0 -0.071315 1.644608 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 30477 0.441582 0.114099 0.782889 0.339382 0.531818 0.245439 -0.818574 -0.095953 0.146649 0.0 0.0 0.158707 -1.429040 0.202462 -0.876407 -1.282999 -0.130403 1.623708 -1.297228 -0.250098 1.552875 0.323283 -0.148247 1.124855 0.326089 1.832181 -0.719225 0.0 0.265746 1.945014 1.550957 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.467691 -0.469002 -0.466432 -1.295325 -1.281853 -1.306751 -1.406720 -1.411452 -1.393801 -1.446508 -1.412660 -1.454015 -1.282999 -1.267095 -1.296988 -1.297228 -1.286981 -1.303228 -1.301647 -1.289777 -1.311887 -0.466179 -0.467595 -0.465676 -1.295813 -1.282128 -1.307431 -0.319273 -0.340464 -0.924861 0.530194 -0.454359 -0.449821 -0.351987 0.189085 0.782686 0.467651 -0.318451 0.793199 -0.767822 -0.274819 -0.338500 -0.365554 -0.466868 0.526366 0.402678 0.342570 0.342570 0.744120 0.451979 0.678120 -0.845483 -0.481173 -1.056021 0.294800 0.419513 1.581426 1.581426 -0.018409 1.675431 1.885978 -0.191873 1.584831 1.584831 -0.974055 0.0 0.425569 1.342953 1.345771 1.347650 0.867355 0.725303 1.231935 0.0 2.792327 -1.466299 1.319367 0.0 2.017605 -0.559623 0.456431 -0.015204 0.825626 1.216489 0.812743 0.549925 1.013913 1.133013 -0.324912 2.006252 1.435504 0.884850 1.716821 1.252237 1.274876 1.097910 1.396909 0.533293 1.230477 1.235153 1.576179 1.977498 -0.393120 0.461845 0.952853 -0.828824 -0.282956 1.153889 1.609682 1.650187 ... 0.373905 0.224449 0.700431 -0.890456 0.0 0.457361 1.577157 0.614724 0.534457 -0.772382 1.296533 1.314706 1.740068 1.765686 -0.473434 5.205693 5.257657 5.246327 1.060684 -0.740591 2.105188 1.530591 1.082391 0.537703 0.272655 1.054297 -0.450399 0.0 0.633197 1.829679 0.887953 0.728076 -0.853602 1.533960 1.540772 2.092902 2.303388 -0.404378 4.705062 4.737761 4.730574 1.332940 -0.588202 2.341679 2.130475 1.337040 0.749555 0.316944 1.339376 -0.354867 0.0 0.773571 2.297097 1.110466 0.842573 -1.214296 1.871263 1.824614 2.606797 2.593685 -0.407993 5.606113 5.716608 5.684703 1.587716 -0.552448 2.884863 2.975891 1.746133 1.096697 0.414079 1.782651 -0.359850 0.413765 1.030214 2.495614 1.485452 0.098314 -1.555968 2.690303 2.739924 2.759871 2.646635 -0.541207 5.611585 5.493050 5.543491 2.214667 -0.569269 3.328373 -0.535001 -0.504662 -0.574435 0.548158 2.842292 -0.475511 0.629798 1.440184 -1.146353 1.900826 0.0 -0.071315 1.644608 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 30478 -0.671374 0.255259 0.782889 0.339382 0.531818 0.255150 1.295149 -0.241597 1.313446 0.0 0.0 0.335827 -1.483901 1.515007 -0.945628 -1.338624 -0.130403 1.623708 -1.348669 -0.250098 1.552875 0.323283 -0.148247 1.124855 0.326089 1.832181 1.685920 0.0 0.265746 -0.931349 1.550957 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.437313 -0.438222 -0.436383 -1.349119 -1.335979 -1.360330 -1.460973 -1.468277 -1.445139 -1.498586 -1.460280 -1.507933 -1.338624 -1.322969 -1.352238 -1.348669 -1.338617 -1.354803 -1.356053 -1.344436 -1.365700 -0.438010 -0.438158 -0.438357 -1.349680 -1.336207 -1.360952 -0.319273 -0.340464 -0.924861 0.530194 -0.454359 -0.449821 -0.351987 0.189085 0.782686 0.467651 -0.318451 0.793199 -0.767822 -0.274819 -0.338500 -0.365554 -0.894191 -0.408629 -0.315732 -0.281484 -0.281484 -0.074746 -0.080142 0.410462 0.125178 -0.544857 0.604902 0.022200 1.048473 -0.105494 -0.105494 -0.500349 -0.138801 -0.203974 -0.599768 0.096659 0.096659 -0.583196 0.0 0.101687 0.753227 0.743037 0.754332 0.429647 0.793468 -0.339099 0.0 0.213926 -0.054341 -0.012711 0.0 0.890779 -0.559623 0.891355 -0.015204 1.113917 0.889262 0.746749 0.407849 0.477322 -0.187772 0.249785 0.712270 0.111053 -0.275717 -0.653102 0.639246 0.613893 1.258119 0.929203 -0.280762 0.877478 0.921405 0.065145 0.737480 -0.140728 -0.082848 -0.338075 0.123798 0.888668 0.522004 0.469048 -0.102337 ... 0.373905 0.224449 0.700431 1.111609 0.0 0.457361 1.577157 0.614724 1.473157 -0.879385 1.296533 1.314706 1.740068 1.765686 -0.464624 0.295729 0.538555 0.444234 1.060684 1.504421 -0.561746 1.530591 -0.987468 0.537703 0.272655 -1.001678 -0.533294 0.0 0.633197 1.829679 0.887953 1.247721 -0.844239 1.533960 1.540772 -0.836549 -0.683500 -0.384394 -0.069406 -0.041665 -0.052826 1.332940 -0.588202 -0.517802 -0.542439 -0.807938 0.749555 0.316944 -0.807555 -0.390294 0.0 0.773571 -0.861651 1.110466 1.339044 -1.128218 1.871263 1.824614 -0.933155 -0.909670 -0.387071 -0.242564 -0.102492 -0.157053 1.587716 -0.552448 -0.479049 -0.454169 -0.649666 1.096697 0.414079 -0.661712 -0.359850 0.413765 1.030214 -0.866798 1.485452 1.136091 -0.682304 -0.562970 -0.655584 -1.140719 -1.065936 -0.516717 -0.147658 -0.102007 -0.119959 -0.667563 -0.551490 -0.567129 -0.508511 -0.504662 -0.592727 0.548158 -0.543422 -0.439027 0.629798 -0.822869 -0.988632 -1.271634 0.0 -0.071315 1.644608 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 340 columns

FEATURE SELECTION

In [80]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
In [81]:
train_data.drop("id",axis=1,inplace=True)
In [82]:
train_data_new=train_data.drop(['price_doc'],axis=1)
In [83]:
y_train=train_data[['price_doc']]
In [84]:
feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=123)) # remember to set the seed, the random state in this function
feature_sel_model.fit(train_data_new, y_train)
Out[84]:
SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=123,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)
In [85]:
selected_feat = train_data_new.columns[(feature_sel_model.get_support())]

# let's print some stats
print('total features: {}'.format((train_data_new.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(np.sum(feature_sel_model.estimator_.coef_ == 0)))
total features: 343
selected features: 120
features with coefficients shrank to zero: 223
In [86]:
selected_feat
Out[86]:
Index(['full_sq', 'life_sq', 'max_floor', 'material', 'build_year', 'num_room',
       'kitch_sq', 'state', 'sub_area', 'preschool_quota',
       ...
       'cafe_sum_5000_max_price_avg', 'cafe_count_5000_price_500',
       'cafe_count_5000_price_1000', 'cafe_count_5000_price_1500',
       'cafe_count_5000_price_2500', 'cafe_count_5000_price_4000',
       'cafe_count_5000_price_high', 'leisure_count_5000', 'sport_count_5000',
       'year'],
      dtype='object', length=120)
In [87]:
train_data_new=train_data_new[selected_feat]
In [88]:
test_data.to_csv("test123.csv")
In [89]:
test_data_new=pd.read_csv("test123.csv",usecols=selected_feat)
In [90]:
test_data_new.shape
Out[90]:
(7662, 120)
In [91]:
train_data_new.shape
Out[91]:
(30471, 120)
In [92]:
from sklearn.model_selection import train_test_split
In [93]:
xtrain,xtest,ytrain,ytest=train_test_split(train_data_new,y_train,test_size=0.3,random_state=123)
In [94]:
xtrain.shape,xtest.shape,ytrain.shape,ytest.shape
Out[94]:
((21329, 120), (9142, 120), (21329, 1), (9142, 1))
In [95]:
ytrain=np.array(ytrain)
ytest = np.array(ytest)
In [96]:
def model_builder(model):
    m = model
    m.fit(xtrain,ytrain) # log(y) follows normal distribution
    train_pred =np.exp(m.predict(xtrain))
    test_pred = np.exp(m.predict(xtest))
    print('========Train=======')
    print('RMSE :' ,np.sqrt(mean_squared_error(ytrain,train_pred)))
    print('MAPE :' , np.mean(np.abs((ytrain-train_pred)/ytrain))*100)
    print('========Test=======')
    print('RMSE :' ,np.sqrt(mean_squared_error(ytest,test_pred)))
    print('MAPE :' , np.mean(np.abs((ytest-test_pred)/ytest))*100)
    return m
In [97]:
train_data_new.fillna(0,inplace=True)
In [98]:
np.where(train_data_new.values >= np.finfo(np.float64).max)
Out[98]:
(array([], dtype=int64), array([], dtype=int64))
In [99]:
lr=model_builder(LinearRegression())
========Train=======
RMSE : 1.9566156934714765
MAPE : 1173.0115037051728
========Test=======
RMSE : 2.168163042526417
MAPE : 892.5329576393402
In [100]:
xg = model_builder(XGBRegressor())
========Train=======
RMSE : 2.4013148581991
MAPE : 1673.4218221945025
========Test=======
RMSE : 2.185605323077104
MAPE : 1338.9667586534404
In [104]:
xg = model_builder(XGBRegressor())
========Train=======
RMSE : 2.4013148581991
MAPE : 1673.4218221945025
========Test=======
RMSE : 2.185605323077104
MAPE : 1338.9667586534404
In [105]:
params ={'max_depth':range(1,15), 'min_samples_split':range(10,70,10)}
 
dt = model_builder(GridSearchCV(DecisionTreeRegressor( ) , param_grid=params , cv=2 , n_jobs=-1))
========Train=======
RMSE : 1.7590339520454807
MAPE : 1473.3109878439675
========Test=======
RMSE : 1.8057512588993503
MAPE : 1238.9236506498446
In [106]:
params = {'n_estimators':range(50,200,50),'learning_rate':[0.5,0.7,1.0]}
ad = model_builder(GridSearchCV(AdaBoostRegressor() , param_grid=params , cv=2 ,n_jobs=-1))
========Train=======
RMSE : 1.0519314655659922
MAPE : 645.4961496920954
========Test=======
RMSE : 1.081722131068907
MAPE : 544.9429240394156
In [107]:
gb = model_builder(GradientBoostingRegressor(n_estimators=50 , learning_rate=0.1))
========Train=======
RMSE : 1.602944277401274
MAPE : 1432.9728593461186
========Test=======
RMSE : 1.626682221055294
MAPE : 1198.0175654116974
In [108]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
In [109]:
clf=model_builder(Ridge())
========Train=======
RMSE : 1.9563973670194497
MAPE : 1172.9959597401967
========Test=======
RMSE : 2.167822260676223
MAPE : 892.5289066706323
In [110]:
clf2=model_builder(Lasso())
========Train=======
RMSE : 1.4105768207813816
MAPE : 1169.3184292169735
========Test=======
RMSE : 1.43518690681185
MAPE : 983.7045381218818
In [111]:
rf123=model_builder(RandomForestRegressor())
========Train=======
RMSE : 2.1293767687410385
MAPE : 1672.4426218161948
========Test=======
RMSE : 1.845423825621202
MAPE : 1265.5462614173314
In [112]:
sub=pd.read_csv("sample_submission.csv/sample_submission.csv")
In [116]:
sub.head()
Out[116]:
id price_doc
0 30474 0.383503
1 30475 1.256324
2 30476 0.514588
3 30477 0.596932
4 30478 0.491853
In [113]:
test_prediction = np.exp(rf123.predict(test_data_new))
In [114]:
sub['price_doc']=test_prediction
In [115]:
sub.to_csv('final_submission_rf34.csv',index=False)
In [ ]: